From b8a2042aa938069e862750553db0e4d82d25822c Mon Sep 17 00:00:00 2001
From: Dimitry Andric <dim@FreeBSD.org>
Date: Thu, 28 Dec 2017 23:57:18 +0000
Subject: [PATCH] Vendor import of llvm trunk r321545:
 https://llvm.org/svn/llvm-project/llvm/trunk@321545

---
 include/llvm/Support/KnownBits.h              |   2 -
 .../SelectionDAG/LegalizeVectorOps.cpp        |  17 +-
 lib/CodeGen/SelectionDAG/SelectionDAG.cpp     |  39 ++-
 lib/LTO/LTOModule.cpp                         |  24 +-
 lib/Target/X86/X86ISelLowering.cpp            |  52 ++--
 lib/Target/X86/X86InstrAVX512.td              |  16 --
 lib/Target/X86/X86WinEHState.cpp              |   6 +
 .../PowerPC/combine_loads_from_build_pair.ll  |   2 +
 .../X86/bitcast-int-to-vector-bool-sext.ll    |  15 +-
 .../X86/bitcast-int-to-vector-bool-zext.ll    |  11 +-
 .../CodeGen/X86/bitcast-int-to-vector-bool.ll |  11 +-
 test/CodeGen/X86/setcc-wide-types.ll          | 252 ++++++++++++++++++
 .../X86/win32-eh-available-externally.ll      |  28 ++
 13 files changed, 387 insertions(+), 88 deletions(-)
 create mode 100644 test/CodeGen/X86/win32-eh-available-externally.ll
diff --git a/include/llvm/Support/KnownBits.h b/include/llvm/Support/KnownBits.h
index 7a4de3e5ff12..97e73b13fca3 100644
--- a/include/llvm/Support/KnownBits.h
+++ b/include/llvm/Support/KnownBits.h
@@ -100,13 +100,11 @@ struct KnownBits {
 
   /// Make this value negative.
   void makeNegative() {
-    assert(!isNonNegative() && "Can't make a non-negative value negative");
     One.setSignBit();
   }
 
   /// Make this value negative.
   void makeNonNegative() {
-    assert(!isNegative() && "Can't make a negative value non-negative");
     Zero.setSignBit();
   }
 
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 74970ab5792c..7643790df350 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -49,6 +49,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "legalizevectorops"
+
 namespace {
 
 class VectorLegalizer {
@@ -226,7 +228,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
   if (Op.getOpcode() == ISD::LOAD) {
     LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
     ISD::LoadExtType ExtType = LD->getExtensionType();
-    if (LD->getMemoryVT().isVector() && ExtType != ISD::NON_EXTLOAD)
+    if (LD->getMemoryVT().isVector() && ExtType != ISD::NON_EXTLOAD) {
+      DEBUG(dbgs() << "\nLegalizing extending vector load: "; Node->dump(&DAG));
       switch (TLI.getLoadExtAction(LD->getExtensionType(), LD->getValueType(0),
                                    LD->getMemoryVT())) {
       default: llvm_unreachable("This action is not supported yet!");
@@ -252,11 +255,14 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
         Changed = true;
         return LegalizeOp(ExpandLoad(Op));
       }
+    }
   } else if (Op.getOpcode() == ISD::STORE) {
     StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
     EVT StVT = ST->getMemoryVT();
     MVT ValVT = ST->getValue().getSimpleValueType();
-    if (StVT.isVector() && ST->isTruncatingStore())
+    if (StVT.isVector() && ST->isTruncatingStore()) {
+      DEBUG(dbgs() << "\nLegalizing truncating vector store: ";
+            Node->dump(&DAG));
       switch (TLI.getTruncStoreAction(ValVT, StVT)) {
       default: llvm_unreachable("This action is not supported yet!");
       case TargetLowering::Legal:
@@ -270,6 +276,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
         Changed = true;
         return LegalizeOp(ExpandStore(Op));
       }
+    }
   } else if (Op.getOpcode() == ISD::MSCATTER || Op.getOpcode() == ISD::MSTORE)
     HasVectorValue = true;
 
@@ -376,6 +383,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
     break;
   }
 
+  DEBUG(dbgs() << "\nLegalizing vector op: "; Node->dump(&DAG));
+
   switch (TLI.getOperationAction(Node->getOpcode(), QueryType)) {
   default: llvm_unreachable("This action is not supported yet!");
   case TargetLowering::Promote:
@@ -383,12 +392,16 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
     Changed = true;
     break;
   case TargetLowering::Legal:
+    DEBUG(dbgs() << "Legal node: nothing to do\n");
     break;
   case TargetLowering::Custom: {
+    DEBUG(dbgs() << "Trying custom legalization\n");
     if (SDValue Tmp1 = TLI.LowerOperation(Op, DAG)) {
+      DEBUG(dbgs() << "Successfully custom legalized node\n");
       Result = Tmp1;
       break;
     }
+    DEBUG(dbgs() << "Could not custom legalize node\n");
     LLVM_FALLTHROUGH;
   }
   case TargetLowering::Expand:
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index a04c770c51c4..4c8b63d2f239 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5943,7 +5943,9 @@ SDValue SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType,
 
   CSEMap.InsertNode(N, IP);
   InsertNode(N);
-  return SDValue(N, 0);
+  SDValue V(N, 0);
+  NewSDValueDbgMsg(V, "Creating new node: ", this);
+  return V;
 }
 
 SDValue SelectionDAG::getLoad(EVT VT, const SDLoc &dl, SDValue Chain,
@@ -6043,7 +6045,9 @@ SDValue SelectionDAG::getStore(SDValue Chain, const SDLoc &dl, SDValue Val,
 
   CSEMap.InsertNode(N, IP);
   InsertNode(N);
-  return SDValue(N, 0);
+  SDValue V(N, 0);
+  NewSDValueDbgMsg(V, "Creating new node: ", this);
+  return V;
 }
 
 SDValue SelectionDAG::getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val,
@@ -6108,7 +6112,9 @@ SDValue SelectionDAG::getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val,
 
   CSEMap.InsertNode(N, IP);
   InsertNode(N);
-  return SDValue(N, 0);
+  SDValue V(N, 0);
+  NewSDValueDbgMsg(V, "Creating new node: ", this);
+  return V;
 }
 
 SDValue SelectionDAG::getIndexedStore(SDValue OrigStore, const SDLoc &dl,
@@ -6134,7 +6140,9 @@ SDValue SelectionDAG::getIndexedStore(SDValue OrigStore, const SDLoc &dl,
 
   CSEMap.InsertNode(N, IP);
   InsertNode(N);
-  return SDValue(N, 0);
+  SDValue V(N, 0);
+  NewSDValueDbgMsg(V, "Creating new node: ", this);
+  return V;
 }
 
 SDValue SelectionDAG::getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain,
@@ -6160,7 +6168,9 @@ SDValue SelectionDAG::getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain,
 
   CSEMap.InsertNode(N, IP);
   InsertNode(N);
-  return SDValue(N, 0);
+  SDValue V(N, 0);
+  NewSDValueDbgMsg(V, "Creating new node: ", this);
+  return V;
 }
 
 SDValue SelectionDAG::getMaskedStore(SDValue Chain, const SDLoc &dl,
@@ -6189,7 +6199,9 @@ SDValue SelectionDAG::getMaskedStore(SDValue Chain, const SDLoc &dl,
 
   CSEMap.InsertNode(N, IP);
   InsertNode(N);
-  return SDValue(N, 0);
+  SDValue V(N, 0);
+  NewSDValueDbgMsg(V, "Creating new node: ", this);
+  return V;
 }
 
 SDValue SelectionDAG::getMaskedGather(SDVTList VTs, EVT VT, const SDLoc &dl,
@@ -6224,7 +6236,9 @@ SDValue SelectionDAG::getMaskedGather(SDVTList VTs, EVT VT, const SDLoc &dl,
 
   CSEMap.InsertNode(N, IP);
   InsertNode(N);
-  return SDValue(N, 0);
+  SDValue V(N, 0);
+  NewSDValueDbgMsg(V, "Creating new node: ", this);
+  return V;
 }
 
 SDValue SelectionDAG::getMaskedScatter(SDVTList VTs, EVT VT, const SDLoc &dl,
@@ -6256,7 +6270,9 @@ SDValue SelectionDAG::getMaskedScatter(SDVTList VTs, EVT VT, const SDLoc &dl,
 
   CSEMap.InsertNode(N, IP);
   InsertNode(N);
-  return SDValue(N, 0);
+  SDValue V(N, 0);
+  NewSDValueDbgMsg(V, "Creating new node: ", this);
+  return V;
 }
 
 SDValue SelectionDAG::getVAArg(EVT VT, const SDLoc &dl, SDValue Chain,
@@ -7112,6 +7128,8 @@ void SelectionDAG::transferDbgValues(SDValue From, SDValue To,
 void SelectionDAG::salvageDebugInfo(SDNode &N) {
   if (!N.getHasDebugValue())
     return;
+
+  SmallVector<SDDbgValue *, 2> ClonedDVs;
   for (auto DV : GetDbgValues(&N)) {
     if (DV->isInvalidated())
       continue;
@@ -7135,13 +7153,16 @@ void SelectionDAG::salvageDebugInfo(SDNode &N) {
         SDDbgValue *Clone =
             getDbgValue(DV->getVariable(), DIExpr, N0.getNode(), N0.getResNo(),
                         DV->isIndirect(), DV->getDebugLoc(), DV->getOrder());
+        ClonedDVs.push_back(Clone);
         DV->setIsInvalidated();
-        AddDbgValue(Clone, N0.getNode(), false);
         DEBUG(dbgs() << "SALVAGE: Rewriting"; N0.getNode()->dumprFull(this);
               dbgs() << " into " << *DIExpr << '\n');
       }
     }
   }
+
+  for (SDDbgValue *Dbg : ClonedDVs)
+    AddDbgValue(Dbg, Dbg->getSDNode(), false);
 }
 
 namespace {
diff --git a/lib/LTO/LTOModule.cpp b/lib/LTO/LTOModule.cpp
index 51b4f225939f..626d2f5dc813 100644
--- a/lib/LTO/LTOModule.cpp
+++ b/lib/LTO/LTOModule.cpp
@@ -388,24 +388,20 @@ void LTOModule::addDefinedDataSymbol(StringRef Name, const GlobalValue *v) {
   // from the ObjC data structures generated by the front end.
 
   // special case if this data blob is an ObjC class definition
-  std::string Section = v->getSection();
-  if (Section.compare(0, 15, "__OBJC,__class,") == 0) {
-    if (const GlobalVariable *gv = dyn_cast<GlobalVariable>(v)) {
-      addObjCClass(gv);
+  if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(v)) {
+    StringRef Section = GV->getSection();
+    if (Section.startswith("__OBJC,__class,")) {
+      addObjCClass(GV);
     }
-  }
 
-  // special case if this data blob is an ObjC category definition
-  else if (Section.compare(0, 18, "__OBJC,__category,") == 0) {
-    if (const GlobalVariable *gv = dyn_cast<GlobalVariable>(v)) {
-      addObjCCategory(gv);
+    // special case if this data blob is an ObjC category definition
+    else if (Section.startswith("__OBJC,__category,")) {
+      addObjCCategory(GV);
     }
-  }
 
-  // special case if this data blob is the list of referenced classes
-  else if (Section.compare(0, 18, "__OBJC,__cls_refs,") == 0) {
-    if (const GlobalVariable *gv = dyn_cast<GlobalVariable>(v)) {
-      addObjCClassRef(gv);
+    // special case if this data blob is the list of referenced classes
+    else if (Section.startswith("__OBJC,__cls_refs,")) {
+      addObjCClassRef(GV);
     }
   }
 }
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index ba3b02e25a9d..9edd799779c7 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -16281,7 +16281,7 @@ static  SDValue LowerZERO_EXTEND_Mask(SDValue Op,
   // Truncate if we had to extend i16/i8 above.
   if (VT != ExtVT) {
     WideVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
-    SelectedVal = DAG.getNode(X86ISD::VTRUNC, DL, WideVT, SelectedVal);
+    SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
   }
 
   // Extract back to 128/256-bit if we widened.
@@ -18426,7 +18426,7 @@ static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
   // Truncate if we had to extend i16/i8 above.
   if (VT != ExtVT) {
     WideVT = MVT::getVectorVT(VTElt, NumElts);
-    V = DAG.getNode(X86ISD::VTRUNC, dl, WideVT, V);
+    V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
   }
 
   // Extract back to 128/256-bit if we widened.
@@ -18679,6 +18679,14 @@ static SDValue LowerExtended1BitVectorLoad(SDValue Op,
       // Replace chain users with the new chain.
       assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
       DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
+      if (Subtarget.hasVLX()) {
+        // Extract to v4i1/v2i1.
+        SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Load,
+                                      DAG.getIntPtrConstant(0, dl));
+        // Finally, do a normal sign-extend to the desired register.
+        return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Extract);
+      }
+
       MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
       SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, Load);
 
@@ -18698,22 +18706,25 @@ static SDValue LowerExtended1BitVectorLoad(SDValue Op,
 
   if (NumElts <= 8) {
     // A subset, assume that we have only AVX-512F
-    unsigned NumBitsToLoad = 8;
-    MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad);
-    SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(),
+    SDValue Load = DAG.getLoad(MVT::i8, dl, Ld->getChain(),
                               Ld->getBasePtr(),
                               Ld->getMemOperand());
     // Replace chain users with the new chain.
     assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
 
-    MVT MaskVT = MVT::getVectorVT(MVT::i1, NumBitsToLoad);
-    SDValue BitVec = DAG.getBitcast(MaskVT, Load);
+    SDValue BitVec = DAG.getBitcast(MVT::v8i1, Load);
 
     if (NumElts == 8)
       return DAG.getNode(ExtOpcode, dl, VT, BitVec);
 
-      // we should take care to v4i1 and v2i1
+    if (Subtarget.hasVLX()) {
+      // Extract to v4i1/v2i1.
+      SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, BitVec,
+                                    DAG.getIntPtrConstant(0, dl));
+      // Finally, do a normal sign-extend to the desired register.
+      return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Extract);
+    }
 
     MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
     SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, BitVec);
@@ -18728,13 +18739,12 @@ static SDValue LowerExtended1BitVectorLoad(SDValue Op,
                                Ld->getBasePtr(),
                                Ld->getMemOperand());
 
-  SDValue BasePtrHi =
-    DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
-                DAG.getConstant(2, dl, BasePtr.getValueType()));
+  SDValue BasePtrHi = DAG.getMemBasePlusOffset(BasePtr, 2, dl);
 
-  SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
-                               BasePtrHi,
-                               Ld->getMemOperand());
+  SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(), BasePtrHi,
+                               Ld->getPointerInfo().getWithOffset(2),
+                               MinAlign(Ld->getAlignment(), 2U),
+                               Ld->getMemOperand()->getFlags());
 
   SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                                  LoadLo.getValue(1), LoadHi.getValue(1));
@@ -34051,15 +34061,14 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
 
     Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
     SDValue Load2 =
-        DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
-                    std::min(16U, Alignment), Ld->getMemOperand()->getFlags());
+        DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
+                    Ld->getPointerInfo().getWithOffset(16),
+                    MinAlign(Alignment, 16U), Ld->getMemOperand()->getFlags());
     SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                              Load1.getValue(1),
                              Load2.getValue(1));
 
-    SDValue NewVec = DAG.getUNDEF(RegVT);
-    NewVec = insert128BitVector(NewVec, Load1, 0, DAG, dl);
-    NewVec = insert128BitVector(NewVec, Load2, NumElems / 2, DAG, dl);
+    SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
     return DCI.CombineTo(N, NewVec, TF, true);
   }
 
@@ -34465,8 +34474,9 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
         DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
                      Alignment, St->getMemOperand()->getFlags());
     SDValue Ch1 =
-        DAG.getStore(St->getChain(), dl, Value1, Ptr1, St->getPointerInfo(),
-                     std::min(16U, Alignment), St->getMemOperand()->getFlags());
+        DAG.getStore(St->getChain(), dl, Value1, Ptr1,
+                     St->getPointerInfo().getWithOffset(16),
+                     MinAlign(Alignment, 16U), St->getMemOperand()->getFlags());
     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
   }
 
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 46c19f18f8d3..dcd84930741b 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -8704,17 +8704,6 @@ def rr : AVX512XS8I<opc, MRMSrcReg, (outs Vec.RC:$dst), (ins Vec.KRC:$src),
                   IIC_SSE_MOV_S_RR>, EVEX, Sched<[WriteMove]>;
 }
 
-// Use 512bit version to implement 128/256 bit in case NoVLX.
-multiclass avx512_convert_mask_to_vector_lowering<X86VectorVTInfo X86Info,
-                                                            X86VectorVTInfo _> {
-
-  def : Pat<(X86Info.VT (X86vsext (X86Info.KVT X86Info.KRC:$src))),
-            (X86Info.VT (EXTRACT_SUBREG
-                           (_.VT (!cast<Instruction>(NAME#"Zrr")
-                             (_.KVT (COPY_TO_REGCLASS X86Info.KRC:$src,_.KRC)))),
-                           X86Info.SubRegIdx))>;
-}
-
 multiclass cvt_mask_by_elt_width<bits<8> opc, AVX512VLVectorVTInfo VTInfo,
                                  string OpcodeStr, Predicate prd> {
 let Predicates = [prd] in
@@ -8724,11 +8713,6 @@ let Predicates = [prd] in
     defm Z256 : cvt_by_vec_width<opc, VTInfo.info256, OpcodeStr>, EVEX_V256;
     defm Z128 : cvt_by_vec_width<opc, VTInfo.info128, OpcodeStr>, EVEX_V128;
   }
-let Predicates = [prd, NoVLX] in {
-   defm Z256_Alt :   avx512_convert_mask_to_vector_lowering<VTInfo.info256,VTInfo.info512>;
-   defm Z128_Alt :   avx512_convert_mask_to_vector_lowering<VTInfo.info128,VTInfo.info512>;
-  }
-
 }
 
 defm VPMOVM2B : cvt_mask_by_elt_width<0x28, avx512vl_i8_info, "vpmovm2" , HasBWI>;
diff --git a/lib/Target/X86/X86WinEHState.cpp b/lib/Target/X86/X86WinEHState.cpp
index 0472a85f50da..6d6dedc60736 100644
--- a/lib/Target/X86/X86WinEHState.cpp
+++ b/lib/Target/X86/X86WinEHState.cpp
@@ -149,6 +149,12 @@ void WinEHStatePass::getAnalysisUsage(AnalysisUsage &AU) const {
 }
 
 bool WinEHStatePass::runOnFunction(Function &F) {
+  // Don't insert state stores or exception handler thunks for
+  // available_externally functions. The handler needs to reference the LSDA,
+  // which will not be emitted in this case.
+  if (F.hasAvailableExternallyLinkage())
+    return false;
+
   // Check the personality. Do nothing if this personality doesn't use funclets.
   if (!F.hasPersonalityFn())
     return false;
diff --git a/test/CodeGen/PowerPC/combine_loads_from_build_pair.ll b/test/CodeGen/PowerPC/combine_loads_from_build_pair.ll
index 0f8f18a17879..45cc740d1eae 100644
--- a/test/CodeGen/PowerPC/combine_loads_from_build_pair.ll
+++ b/test/CodeGen/PowerPC/combine_loads_from_build_pair.ll
@@ -12,6 +12,8 @@ define i64 @func1(i64 %p1, i64 %p2, i64 %p3, i64 %p4, { i64, i8* } %struct) {
 ; CHECK-DAG:     [[LOBITS:t[0-9]+]]: i32,ch = load<LD4[FixedStack-2]>
 ; CHECK-DAG:     [[HIBITS:t[0-9]+]]: i32,ch = load<LD4[FixedStack-1]>
 ; CHECK: Combining: t{{[0-9]+}}: i64 = build_pair [[LOBITS]], [[HIBITS]]
+; CHECK-NEXT: Creating new node
+; CHECK-SAME: load<LD8[FixedStack-1]
 ; CHECK-NEXT: into
 ; CHECK-SAME: load<LD8[FixedStack-1]
 ; CHECK-LABEL: Optimized lowered selection DAG:
diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
index dcddb8e82642..6ef2be99dee5 100644
--- a/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
+++ b/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
@@ -48,9 +48,8 @@ define <2 x i64> @ext_i2_2i64(i2 %a0) {
 ; AVX512-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
 ; AVX512-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; AVX512-NEXT:    kmovd %eax, %k1
-; AVX512-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512-NEXT:    # kill: def %xmm0 killed %xmm0 killed %zmm0
-; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; AVX512-NEXT:    retq
   %1 = bitcast i2 %a0 to <2 x i1>
   %2 = sext <2 x i1> %1 to <2 x i64>
@@ -91,10 +90,8 @@ define <4 x i32> @ext_i4_4i32(i4 %a0) {
 ; AVX512-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
 ; AVX512-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; AVX512-NEXT:    kmovd %eax, %k1
-; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
-; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
-; AVX512-NEXT:    # kill: def %xmm0 killed %xmm0 killed %ymm0
-; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512-NEXT:    retq
   %1 = bitcast i4 %a0 to <4 x i1>
   %2 = sext <4 x i1> %1 to <4 x i32>
@@ -246,8 +243,8 @@ define <4 x i64> @ext_i4_4i64(i4 %a0) {
 ; AVX512-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
 ; AVX512-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; AVX512-NEXT:    kmovd %eax, %k1
-; AVX512-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512-NEXT:    # kill: def %ymm0 killed %ymm0 killed %zmm0
+; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
 ; AVX512-NEXT:    retq
   %1 = bitcast i4 %a0 to <4 x i1>
   %2 = sext <4 x i1> %1 to <4 x i64>
diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
index f88b540323cb..9e77cd11449e 100644
--- a/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
+++ b/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
@@ -63,9 +63,7 @@ define <2 x i64> @ext_i2_2i64(i2 %a0) {
 ; AVX512VLBW-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
 ; AVX512VLBW-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; AVX512VLBW-NEXT:    kmovd %eax, %k1
-; AVX512VLBW-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
-; AVX512VLBW-NEXT:    # kill: def %xmm0 killed %xmm0 killed %zmm0
-; AVX512VLBW-NEXT:    vzeroupper
+; AVX512VLBW-NEXT:    vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z}
 ; AVX512VLBW-NEXT:    retq
   %1 = bitcast i2 %a0 to <2 x i1>
   %2 = zext <2 x i1> %1 to <2 x i64>
@@ -120,9 +118,7 @@ define <4 x i32> @ext_i4_4i32(i4 %a0) {
 ; AVX512VLBW-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
 ; AVX512VLBW-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; AVX512VLBW-NEXT:    kmovd %eax, %k1
-; AVX512VLBW-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z}
-; AVX512VLBW-NEXT:    # kill: def %xmm0 killed %xmm0 killed %ymm0
-; AVX512VLBW-NEXT:    vzeroupper
+; AVX512VLBW-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
 ; AVX512VLBW-NEXT:    retq
   %1 = bitcast i4 %a0 to <4 x i1>
   %2 = zext <4 x i1> %1 to <4 x i32>
@@ -317,8 +313,7 @@ define <4 x i64> @ext_i4_4i64(i4 %a0) {
 ; AVX512VLBW-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
 ; AVX512VLBW-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; AVX512VLBW-NEXT:    kmovd %eax, %k1
-; AVX512VLBW-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
-; AVX512VLBW-NEXT:    # kill: def %ymm0 killed %ymm0 killed %zmm0
+; AVX512VLBW-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm0 {%k1} {z}
 ; AVX512VLBW-NEXT:    retq
   %1 = bitcast i4 %a0 to <4 x i1>
   %2 = zext <4 x i1> %1 to <4 x i64>
diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
index 6d9f832d861f..45a48fae146d 100644
--- a/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
+++ b/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
@@ -46,9 +46,8 @@ define <2 x i1> @bitcast_i2_2i1(i2 zeroext %a0) {
 ; AVX512-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
 ; AVX512-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; AVX512-NEXT:    kmovd %eax, %k1
-; AVX512-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512-NEXT:    # kill: def %xmm0 killed %xmm0 killed %zmm0
-; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; AVX512-NEXT:    retq
   %1 = bitcast i2 %a0 to <2 x i1>
   ret <2 x i1> %1
@@ -90,10 +89,8 @@ define <4 x i1> @bitcast_i4_4i1(i4 zeroext %a0) {
 ; AVX512-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
 ; AVX512-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; AVX512-NEXT:    kmovd %eax, %k1
-; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
-; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
-; AVX512-NEXT:    # kill: def %xmm0 killed %xmm0 killed %ymm0
-; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512-NEXT:    retq
   %1 = bitcast i4 %a0 to <4 x i1>
   ret <4 x i1> %1
diff --git a/test/CodeGen/X86/setcc-wide-types.ll b/test/CodeGen/X86/setcc-wide-types.ll
index f935db72dcb9..410378ffbad2 100644
--- a/test/CodeGen/X86/setcc-wide-types.ll
+++ b/test/CodeGen/X86/setcc-wide-types.ll
@@ -138,3 +138,255 @@ define i32 @eq_i256(<4 x i64> %x, <4 x i64> %y) {
   ret i32 %zext
 }
 
+; This test models the expansion of 'memcmp(a, b, 32) != 0' 
+; if we allowed 2 pairs of 16-byte loads per block.
+
+define i32 @ne_i128_pair(i128* %a, i128* %b) {
+; SSE2-LABEL: ne_i128_pair:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq (%rdi), %rax
+; SSE2-NEXT:    movq 8(%rdi), %rcx
+; SSE2-NEXT:    xorq (%rsi), %rax
+; SSE2-NEXT:    xorq 8(%rsi), %rcx
+; SSE2-NEXT:    movq 24(%rdi), %rdx
+; SSE2-NEXT:    movq 16(%rdi), %rdi
+; SSE2-NEXT:    xorq 16(%rsi), %rdi
+; SSE2-NEXT:    orq %rax, %rdi
+; SSE2-NEXT:    xorq 24(%rsi), %rdx
+; SSE2-NEXT:    orq %rcx, %rdx
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    orq %rdi, %rdx
+; SSE2-NEXT:    setne %al
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: ne_i128_pair:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movq (%rdi), %rax
+; AVX2-NEXT:    movq 8(%rdi), %rcx
+; AVX2-NEXT:    xorq (%rsi), %rax
+; AVX2-NEXT:    xorq 8(%rsi), %rcx
+; AVX2-NEXT:    movq 24(%rdi), %rdx
+; AVX2-NEXT:    movq 16(%rdi), %rdi
+; AVX2-NEXT:    xorq 16(%rsi), %rdi
+; AVX2-NEXT:    orq %rax, %rdi
+; AVX2-NEXT:    xorq 24(%rsi), %rdx
+; AVX2-NEXT:    orq %rcx, %rdx
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    orq %rdi, %rdx
+; AVX2-NEXT:    setne %al
+; AVX2-NEXT:    retq
+  %a0 = load i128, i128* %a
+  %b0 = load i128, i128* %b
+  %xor1 = xor i128 %a0, %b0
+  %ap1 = getelementptr i128, i128* %a, i128 1
+  %bp1 = getelementptr i128, i128* %b, i128 1
+  %a1 = load i128, i128* %ap1
+  %b1 = load i128, i128* %bp1
+  %xor2 = xor i128 %a1, %b1
+  %or = or i128 %xor1, %xor2
+  %cmp = icmp ne i128 %or, 0
+  %z = zext i1 %cmp to i32
+  ret i32 %z
+}
+
+; This test models the expansion of 'memcmp(a, b, 32) == 0' 
+; if we allowed 2 pairs of 16-byte loads per block.
+
+define i32 @eq_i128_pair(i128* %a, i128* %b) {
+; SSE2-LABEL: eq_i128_pair:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq (%rdi), %rax
+; SSE2-NEXT:    movq 8(%rdi), %rcx
+; SSE2-NEXT:    xorq (%rsi), %rax
+; SSE2-NEXT:    xorq 8(%rsi), %rcx
+; SSE2-NEXT:    movq 24(%rdi), %rdx
+; SSE2-NEXT:    movq 16(%rdi), %rdi
+; SSE2-NEXT:    xorq 16(%rsi), %rdi
+; SSE2-NEXT:    orq %rax, %rdi
+; SSE2-NEXT:    xorq 24(%rsi), %rdx
+; SSE2-NEXT:    orq %rcx, %rdx
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    orq %rdi, %rdx
+; SSE2-NEXT:    sete %al
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: eq_i128_pair:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movq (%rdi), %rax
+; AVX2-NEXT:    movq 8(%rdi), %rcx
+; AVX2-NEXT:    xorq (%rsi), %rax
+; AVX2-NEXT:    xorq 8(%rsi), %rcx
+; AVX2-NEXT:    movq 24(%rdi), %rdx
+; AVX2-NEXT:    movq 16(%rdi), %rdi
+; AVX2-NEXT:    xorq 16(%rsi), %rdi
+; AVX2-NEXT:    orq %rax, %rdi
+; AVX2-NEXT:    xorq 24(%rsi), %rdx
+; AVX2-NEXT:    orq %rcx, %rdx
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    orq %rdi, %rdx
+; AVX2-NEXT:    sete %al
+; AVX2-NEXT:    retq
+  %a0 = load i128, i128* %a
+  %b0 = load i128, i128* %b
+  %xor1 = xor i128 %a0, %b0
+  %ap1 = getelementptr i128, i128* %a, i128 1
+  %bp1 = getelementptr i128, i128* %b, i128 1
+  %a1 = load i128, i128* %ap1
+  %b1 = load i128, i128* %bp1
+  %xor2 = xor i128 %a1, %b1
+  %or = or i128 %xor1, %xor2
+  %cmp = icmp eq i128 %or, 0
+  %z = zext i1 %cmp to i32
+  ret i32 %z
+}
+
+; This test models the expansion of 'memcmp(a, b, 64) != 0' 
+; if we allowed 2 pairs of 32-byte loads per block.
+
+define i32 @ne_i256_pair(i256* %a, i256* %b) {
+; SSE2-LABEL: ne_i256_pair:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq 16(%rdi), %r9
+; SSE2-NEXT:    movq 24(%rdi), %r11
+; SSE2-NEXT:    movq (%rdi), %r8
+; SSE2-NEXT:    movq 8(%rdi), %r10
+; SSE2-NEXT:    xorq 8(%rsi), %r10
+; SSE2-NEXT:    xorq 24(%rsi), %r11
+; SSE2-NEXT:    xorq (%rsi), %r8
+; SSE2-NEXT:    xorq 16(%rsi), %r9
+; SSE2-NEXT:    movq 48(%rdi), %rdx
+; SSE2-NEXT:    movq 32(%rdi), %rax
+; SSE2-NEXT:    movq 56(%rdi), %rcx
+; SSE2-NEXT:    movq 40(%rdi), %rdi
+; SSE2-NEXT:    xorq 40(%rsi), %rdi
+; SSE2-NEXT:    xorq 56(%rsi), %rcx
+; SSE2-NEXT:    orq %r11, %rcx
+; SSE2-NEXT:    orq %rdi, %rcx
+; SSE2-NEXT:    orq %r10, %rcx
+; SSE2-NEXT:    xorq 32(%rsi), %rax
+; SSE2-NEXT:    xorq 48(%rsi), %rdx
+; SSE2-NEXT:    orq %r9, %rdx
+; SSE2-NEXT:    orq %rax, %rdx
+; SSE2-NEXT:    orq %r8, %rdx
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    orq %rcx, %rdx
+; SSE2-NEXT:    setne %al
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: ne_i256_pair:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movq 16(%rdi), %r9
+; AVX2-NEXT:    movq 24(%rdi), %r11
+; AVX2-NEXT:    movq (%rdi), %r8
+; AVX2-NEXT:    movq 8(%rdi), %r10
+; AVX2-NEXT:    xorq 8(%rsi), %r10
+; AVX2-NEXT:    xorq 24(%rsi), %r11
+; AVX2-NEXT:    xorq (%rsi), %r8
+; AVX2-NEXT:    xorq 16(%rsi), %r9
+; AVX2-NEXT:    movq 48(%rdi), %rdx
+; AVX2-NEXT:    movq 32(%rdi), %rax
+; AVX2-NEXT:    movq 56(%rdi), %rcx
+; AVX2-NEXT:    movq 40(%rdi), %rdi
+; AVX2-NEXT:    xorq 40(%rsi), %rdi
+; AVX2-NEXT:    xorq 56(%rsi), %rcx
+; AVX2-NEXT:    orq %r11, %rcx
+; AVX2-NEXT:    orq %rdi, %rcx
+; AVX2-NEXT:    orq %r10, %rcx
+; AVX2-NEXT:    xorq 32(%rsi), %rax
+; AVX2-NEXT:    xorq 48(%rsi), %rdx
+; AVX2-NEXT:    orq %r9, %rdx
+; AVX2-NEXT:    orq %rax, %rdx
+; AVX2-NEXT:    orq %r8, %rdx
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    orq %rcx, %rdx
+; AVX2-NEXT:    setne %al
+; AVX2-NEXT:    retq
+  %a0 = load i256, i256* %a
+  %b0 = load i256, i256* %b
+  %xor1 = xor i256 %a0, %b0
+  %ap1 = getelementptr i256, i256* %a, i256 1
+  %bp1 = getelementptr i256, i256* %b, i256 1
+  %a1 = load i256, i256* %ap1
+  %b1 = load i256, i256* %bp1
+  %xor2 = xor i256 %a1, %b1
+  %or = or i256 %xor1, %xor2
+  %cmp = icmp ne i256 %or, 0
+  %z = zext i1 %cmp to i32
+  ret i32 %z
+}
+
+; This test models the expansion of 'memcmp(a, b, 64) == 0' 
+; if we allowed 2 pairs of 32-byte loads per block.
+
+define i32 @eq_i256_pair(i256* %a, i256* %b) {
+; SSE2-LABEL: eq_i256_pair:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq 16(%rdi), %r9
+; SSE2-NEXT:    movq 24(%rdi), %r11
+; SSE2-NEXT:    movq (%rdi), %r8
+; SSE2-NEXT:    movq 8(%rdi), %r10
+; SSE2-NEXT:    xorq 8(%rsi), %r10
+; SSE2-NEXT:    xorq 24(%rsi), %r11
+; SSE2-NEXT:    xorq (%rsi), %r8
+; SSE2-NEXT:    xorq 16(%rsi), %r9
+; SSE2-NEXT:    movq 48(%rdi), %rdx
+; SSE2-NEXT:    movq 32(%rdi), %rax
+; SSE2-NEXT:    movq 56(%rdi), %rcx
+; SSE2-NEXT:    movq 40(%rdi), %rdi
+; SSE2-NEXT:    xorq 40(%rsi), %rdi
+; SSE2-NEXT:    xorq 56(%rsi), %rcx
+; SSE2-NEXT:    orq %r11, %rcx
+; SSE2-NEXT:    orq %rdi, %rcx
+; SSE2-NEXT:    orq %r10, %rcx
+; SSE2-NEXT:    xorq 32(%rsi), %rax
+; SSE2-NEXT:    xorq 48(%rsi), %rdx
+; SSE2-NEXT:    orq %r9, %rdx
+; SSE2-NEXT:    orq %rax, %rdx
+; SSE2-NEXT:    orq %r8, %rdx
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    orq %rcx, %rdx
+; SSE2-NEXT:    sete %al
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: eq_i256_pair:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movq 16(%rdi), %r9
+; AVX2-NEXT:    movq 24(%rdi), %r11
+; AVX2-NEXT:    movq (%rdi), %r8
+; AVX2-NEXT:    movq 8(%rdi), %r10
+; AVX2-NEXT:    xorq 8(%rsi), %r10
+; AVX2-NEXT:    xorq 24(%rsi), %r11
+; AVX2-NEXT:    xorq (%rsi), %r8
+; AVX2-NEXT:    xorq 16(%rsi), %r9
+; AVX2-NEXT:    movq 48(%rdi), %rdx
+; AVX2-NEXT:    movq 32(%rdi), %rax
+; AVX2-NEXT:    movq 56(%rdi), %rcx
+; AVX2-NEXT:    movq 40(%rdi), %rdi
+; AVX2-NEXT:    xorq 40(%rsi), %rdi
+; AVX2-NEXT:    xorq 56(%rsi), %rcx
+; AVX2-NEXT:    orq %r11, %rcx
+; AVX2-NEXT:    orq %rdi, %rcx
+; AVX2-NEXT:    orq %r10, %rcx
+; AVX2-NEXT:    xorq 32(%rsi), %rax
+; AVX2-NEXT:    xorq 48(%rsi), %rdx
+; AVX2-NEXT:    orq %r9, %rdx
+; AVX2-NEXT:    orq %rax, %rdx
+; AVX2-NEXT:    orq %r8, %rdx
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    orq %rcx, %rdx
+; AVX2-NEXT:    sete %al
+; AVX2-NEXT:    retq
+  %a0 = load i256, i256* %a
+  %b0 = load i256, i256* %b
+  %xor1 = xor i256 %a0, %b0
+  %ap1 = getelementptr i256, i256* %a, i256 1
+  %bp1 = getelementptr i256, i256* %b, i256 1
+  %a1 = load i256, i256* %ap1
+  %b1 = load i256, i256* %bp1
+  %xor2 = xor i256 %a1, %b1
+  %or = or i256 %xor1, %xor2
+  %cmp = icmp eq i256 %or, 0
+  %z = zext i1 %cmp to i32
+  ret i32 %z
+}
+
diff --git a/test/CodeGen/X86/win32-eh-available-externally.ll b/test/CodeGen/X86/win32-eh-available-externally.ll
new file mode 100644
index 000000000000..49da191de978
--- /dev/null
+++ b/test/CodeGen/X86/win32-eh-available-externally.ll
@@ -0,0 +1,28 @@
+; RUN: opt -S -x86-winehstate < %s | FileCheck %s --check-prefix=IR
+; RUN: llc < %s | FileCheck %s --check-prefix=ASM
+
+; IR-NOT: define.*__ehhandler
+; IR: define available_externally void @foo(void ()*)
+; IR-NOT: define.*__ehhandler
+
+; No code should be emitted.
+; ASM-NOT: __ehtable
+; ASM-NOT: __ehhandler
+
+target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
+target triple = "i686-pc-windows-msvc"
+
+declare i32 @__CxxFrameHandler3(...) unnamed_addr
+
+define available_externally void @foo(void ()*) personality i32 (...)* @__CxxFrameHandler3 {
+start:
+  invoke void %0()
+          to label %good unwind label %bad
+
+good:                                             ; preds = %start
+  ret void
+
+bad:                                              ; preds = %start
+  %cleanuppad = cleanuppad within none []
+  cleanupret from %cleanuppad unwind to caller
+}