Vendor import of llvm RELEASE_360/rc4 tag r229772 (effectively, 3.6.0 RC4):

https://llvm.org/svn/llvm-project/llvm/tags/RELEASE_360/rc4@229772
dim 2015-02-19 20:55:17 +00:00
parent 85d2764eab
commit 49b6407b6c
36 changed files with 643 additions and 123 deletions

View File

@@ -5,11 +5,6 @@ LLVM 3.6 Release Notes
 .. contents::
     :local:

-.. warning::
-   These are in-progress notes for the upcoming LLVM 3.6 release.  You may
-   prefer the `LLVM 3.5 Release Notes <http://llvm.org/releases/3.5.0/docs
-   /ReleaseNotes.html>`_.
-
 Introduction
 ============
@@ -26,10 +21,6 @@ have questions or comments, the `LLVM Developer's Mailing List
 <http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev>`_ is a good place to send
 them.

-Note that if you are reading this file from a Subversion checkout or the main
-LLVM web page, this document applies to the *next* release, not the current
-one.  To see the release notes for a specific release, please see the `releases
-page <http://llvm.org/releases/>`_.
-
 Non-comprehensive list of changes in this release
 =================================================
@@ -544,6 +535,33 @@ new LLVM-based code generators "on the fly" for the designed processors and
 loads them in to the compiler backend as runtime libraries to avoid
 per-target recompilation of larger parts of the compiler chain.

+Likely
+------
+
+`Likely <http://www.liblikely.org>`_ is an embeddable just-in-time Lisp for
+image recognition and heterogenous computing. Algorithms are just-in-time
+compiled using LLVM's MCJIT infrastructure to execute on single or
+multi-threaded CPUs and potentially OpenCL SPIR or CUDA enabled GPUs.
+Likely seeks to explore new optimizations for statistical learning
+algorithms by moving them from an offline model generation step to the
+compile-time evaluation of a function (the learning algorithm) with constant
+arguments (the training data).
+
+LDC - the LLVM-based D compiler
+-------------------------------
+
+`D <http://dlang.org>`_ is a language with C-like syntax and static typing. It
+pragmatically combines efficiency, control, and modeling power, with safety and
+programmer productivity. D supports powerful concepts like Compile-Time Function
+Execution (CTFE) and Template Meta-Programming, provides an innovative approach
+to concurrency and offers many classical paradigms.
+
+`LDC <http://wiki.dlang.org/LDC>`_ uses the frontend from the reference compiler
+combined with LLVM as backend to produce efficient native code. LDC targets
+x86/x86_64 systems like Linux, OS X, FreeBSD and Windows and also Linux on
+PowerPC (32/64 bit). Ports to other architectures like ARM, AArch64 and MIPS64
+are underway.
+
 Additional Information
 ======================

View File

@@ -1,11 +1,6 @@
 Overview
 ========

-.. warning::
-
-   If you are using a released version of LLVM, see `the download page
-   <http://llvm.org/releases/>`_ to find your documentation.
-
 The LLVM compiler infrastructure supports a wide range of projects, from
 industrial strength compilers to specialized JIT applications to small
 research projects.

View File

@@ -4,6 +4,7 @@ set(LLVM_LINK_COMPONENTS
   ExecutionEngine
   InstCombine
   MC
+  RuntimeDyld
   ScalarOpts
   Support
   native

View File

@@ -867,9 +867,11 @@ class SelectionDAG {
                           SDValue Offset, ISD::MemIndexedMode AM);
   SDValue getMaskedLoad(EVT VT, SDLoc dl, SDValue Chain, SDValue Ptr,
-                        SDValue Mask, SDValue Src0, MachineMemOperand *MMO);
+                        SDValue Mask, SDValue Src0, EVT MemVT,
+                        MachineMemOperand *MMO, ISD::LoadExtType);
   SDValue getMaskedStore(SDValue Chain, SDLoc dl, SDValue Val,
-                         SDValue Ptr, SDValue Mask, MachineMemOperand *MMO);
+                         SDValue Ptr, SDValue Mask, EVT MemVT,
+                         MachineMemOperand *MMO, bool IsTrunc);

   /// getSrcValue - Construct a node to track a Value* through the backend.
   SDValue getSrcValue(const Value *v);

View File

@@ -1970,13 +1970,17 @@ class MaskedLoadStoreSDNode : public MemSDNode {
 class MaskedLoadSDNode : public MaskedLoadStoreSDNode {
 public:
   friend class SelectionDAG;
-  MaskedLoadSDNode(unsigned Order, DebugLoc dl,
-                   SDValue *Operands, unsigned numOperands,
-                   SDVTList VTs, EVT MemVT, MachineMemOperand *MMO)
+  MaskedLoadSDNode(unsigned Order, DebugLoc dl, SDValue *Operands,
+                   unsigned numOperands, SDVTList VTs, ISD::LoadExtType ETy,
+                   EVT MemVT, MachineMemOperand *MMO)
     : MaskedLoadStoreSDNode(ISD::MLOAD, Order, dl, Operands, numOperands,
-                            VTs, MemVT, MMO)
-  {}
+                            VTs, MemVT, MMO) {
+    SubclassData |= (unsigned short)ETy;
+  }
+
+  ISD::LoadExtType getExtensionType() const {
+    return ISD::LoadExtType(SubclassData & 3);
+  }

   const SDValue &getSrc0() const { return getOperand(3); }
   static bool classof(const SDNode *N) {
     return N->getOpcode() == ISD::MLOAD;
@@ -1989,14 +1993,19 @@ class MaskedStoreSDNode : public MaskedLoadStoreSDNode {
 public:
   friend class SelectionDAG;
-  MaskedStoreSDNode(unsigned Order, DebugLoc dl,
-                    SDValue *Operands, unsigned numOperands,
-                    SDVTList VTs, EVT MemVT, MachineMemOperand *MMO)
+  MaskedStoreSDNode(unsigned Order, DebugLoc dl, SDValue *Operands,
+                    unsigned numOperands, SDVTList VTs, bool isTrunc, EVT MemVT,
+                    MachineMemOperand *MMO)
     : MaskedLoadStoreSDNode(ISD::MSTORE, Order, dl, Operands, numOperands,
-                            VTs, MemVT, MMO)
-  {}
+                            VTs, MemVT, MMO) {
+    SubclassData |= (unsigned short)isTrunc;
+  }
+
+  /// isTruncatingStore - Return true if the op does a truncation before store.
+  /// For integers this is the same as doing a TRUNCATE and storing the result.
+  /// For floats, it is the same as doing an FP_ROUND and storing the result.
+  bool isTruncatingStore() const { return SubclassData & 1; }

-  const SDValue &getData() const { return getOperand(3); }
+  const SDValue &getValue() const { return getOperand(3); }
   static bool classof(const SDNode *N) {
     return N->getOpcode() == ISD::MSTORE;
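Both constructors pack their new flag into spare bits of MemSDNode's SubclassData rather than adding a member: the load keeps a two-bit ISD::LoadExtType, the store a one-bit truncation flag. A minimal self-contained sketch of that bit-packing idiom (plain C++; the names mirror the diff, but nothing here is LLVM API):

    #include <cassert>
    #include <cstdint>
    #include <iostream>

    // Mirrors ISD::LoadExtType: two bits cover all four kinds.
    enum LoadExtType : uint16_t { NON_EXTLOAD = 0, EXTLOAD = 1, SEXTLOAD = 2, ZEXTLOAD = 3 };

    struct NodeFlags {
      uint16_t SubclassData = 0;  // spare bits shared by subclasses

      void setExtType(LoadExtType ETy) { SubclassData |= (uint16_t)ETy; }
      LoadExtType getExtensionType() const { return LoadExtType(SubclassData & 3); }

      void setTruncating(bool IsTrunc) { SubclassData |= (uint16_t)IsTrunc; }
      bool isTruncatingStore() const { return SubclassData & 1; }
    };

    int main() {
      NodeFlags Load;
      Load.setExtType(SEXTLOAD);
      assert(Load.getExtensionType() == SEXTLOAD);

      NodeFlags Store;
      Store.setTruncating(true);
      std::cout << Store.isTruncatingStore() << "\n";  // prints 1
    }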

View File

@@ -6,9 +6,6 @@
 /* Exported configuration */
 #include "llvm/Config/llvm-config.h"

-/* Patch version of the LLVM API */
-#cmakedefine LLVM_VERSION_PATCH ${LLVM_VERSION_PATCH}
-
 /* Bug report URL. */
 #define BUG_REPORT_URL "${BUG_REPORT_URL}"

View File

@@ -87,10 +87,13 @@
 #cmakedefine LLVM_USE_OPROFILE 1

 /* Major version of the LLVM API */
-#cmakedefine LLVM_VERSION_MAJOR ${LLVM_VERSION_MAJOR}
+#define LLVM_VERSION_MAJOR ${LLVM_VERSION_MAJOR}

 /* Minor version of the LLVM API */
-#cmakedefine LLVM_VERSION_MINOR ${LLVM_VERSION_MINOR}
+#define LLVM_VERSION_MINOR ${LLVM_VERSION_MINOR}
+
+/* Patch version of the LLVM API */
+#define LLVM_VERSION_PATCH ${LLVM_VERSION_PATCH}

 /* LLVM version string */
 #define LLVM_VERSION_STRING "${PACKAGE_VERSION}"

View File

@@ -92,6 +92,9 @@
 /* Minor version of the LLVM API */
 #undef LLVM_VERSION_MINOR

+/* Patch version of the LLVM API */
+#undef LLVM_VERSION_PATCH
+
 /* LLVM version string */
 #undef LLVM_VERSION_STRING

View File

@@ -325,6 +325,9 @@ class ConstantAggregateZero : public Constant {
   /// index.
   Constant *getElementValue(unsigned Idx) const;

+  /// \brief Return the number of elements in the array, vector, or struct.
+  unsigned getNumElements() const;
+
   /// Methods for support type inquiry through isa, cast, and dyn_cast:
   ///
   static bool classof(const Value *V) {
@@ -1196,6 +1199,9 @@ class UndefValue : public Constant {
   /// index.
   UndefValue *getElementValue(unsigned Idx) const;

+  /// \brief Return the number of elements in the array, vector, or struct.
+  unsigned getNumElements() const;
+
   void destroyConstant() override;

   /// Methods for support type inquiry through isa, cast, and dyn_cast:

View File

@@ -538,9 +538,17 @@ Metadata *BitcodeReaderMDValueList::getValueFwdRef(unsigned Idx) {
   if (Metadata *MD = MDValuePtrs[Idx])
     return MD;

-  // Create and return a placeholder, which will later be RAUW'd.
-  AnyFwdRefs = true;
+  // Track forward refs to be resolved later.
+  if (AnyFwdRefs) {
+    MinFwdRef = std::min(MinFwdRef, Idx);
+    MaxFwdRef = std::max(MaxFwdRef, Idx);
+  } else {
+    AnyFwdRefs = true;
+    MinFwdRef = MaxFwdRef = Idx;
+  }
   ++NumFwdRefs;
+
+  // Create and return a placeholder, which will later be RAUW'd.
   Metadata *MD = MDNode::getTemporary(Context, None);
   MDValuePtrs[Idx].reset(MD);
   return MD;
@@ -556,11 +564,15 @@ void BitcodeReaderMDValueList::tryToResolveCycles() {
     return;

   // Resolve any cycles.
-  for (auto &MD : MDValuePtrs) {
+  for (unsigned I = MinFwdRef, E = MaxFwdRef + 1; I != E; ++I) {
+    auto &MD = MDValuePtrs[I];
     assert(!(MD && isa<MDNodeFwdDecl>(MD)) && "Unexpected forward reference");
     if (auto *N = dyn_cast_or_null<UniquableMDNode>(MD))
       N->resolveCycles();
   }
+
+  // Make sure we return early again until there's another forward ref.
+  AnyFwdRefs = false;
 }

 Type *BitcodeReader::getTypeByID(unsigned ID) {
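The point of MinFwdRef/MaxFwdRef is that tryToResolveCycles no longer walks every metadata slot, only the index window that ever held a forward reference, and then resets AnyFwdRefs so later calls return early. A self-contained sketch of that window-tracking pattern (illustrative names, not the LLVM API):

    #include <algorithm>
    #include <iostream>
    #include <vector>

    struct FwdRefWindow {
      bool AnyFwdRefs = false;
      unsigned MinFwdRef = 0, MaxFwdRef = 0;

      // Record that slot Idx received a placeholder.
      void noteFwdRef(unsigned Idx) {
        if (AnyFwdRefs) {
          MinFwdRef = std::min(MinFwdRef, Idx);
          MaxFwdRef = std::max(MaxFwdRef, Idx);
        } else {
          AnyFwdRefs = true;
          MinFwdRef = MaxFwdRef = Idx;
        }
      }

      // Visit only slots that could contain a forward reference.
      template <typename Fn> void resolve(std::vector<int> &Slots, Fn Visit) {
        if (!AnyFwdRefs)
          return;                    // nothing outstanding: early exit
        for (unsigned I = MinFwdRef, E = MaxFwdRef + 1; I != E; ++I)
          Visit(Slots[I]);
        AnyFwdRefs = false;          // reset until the next forward ref
      }
    };

    int main() {
      std::vector<int> Slots(1000, 0);
      FwdRefWindow W;
      W.noteFwdRef(10);
      W.noteFwdRef(42);
      unsigned Visited = 0;
      W.resolve(Slots, [&](int &) { ++Visited; });
      std::cout << Visited << "\n";  // 33 slots scanned instead of 1000
    }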

View File

@@ -99,6 +99,8 @@ class BitcodeReaderValueList {
 class BitcodeReaderMDValueList {
   unsigned NumFwdRefs;
   bool AnyFwdRefs;
+  unsigned MinFwdRef;
+  unsigned MaxFwdRef;
   std::vector<TrackingMDRef> MDValuePtrs;

   LLVMContext &Context;

View File

@@ -4842,7 +4842,7 @@ SDValue DAGCombiner::visitMSTORE(SDNode *N) {
   MaskedStoreSDNode *MST = dyn_cast<MaskedStoreSDNode>(N);
   SDValue Mask = MST->getMask();
-  SDValue Data = MST->getData();
+  SDValue Data = MST->getValue();
   SDLoc DL(N);

   // If the MSTORE data type requires splitting and the mask is provided by a
@@ -4885,7 +4885,8 @@ SDValue DAGCombiner::visitMSTORE(SDNode *N) {
                           MachineMemOperand::MOStore, LoMemVT.getStoreSize(),
                           Alignment, MST->getAAInfo(), MST->getRanges());

-    Lo = DAG.getMaskedStore(Chain, DL, DataLo, Ptr, MaskLo, MMO);
+    Lo = DAG.getMaskedStore(Chain, DL, DataLo, Ptr, MaskLo, LoMemVT, MMO,
+                            MST->isTruncatingStore());

     unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
     Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
@@ -4897,7 +4898,8 @@ SDValue DAGCombiner::visitMSTORE(SDNode *N) {
                           SecondHalfAlignment, MST->getAAInfo(),
                           MST->getRanges());

-    Hi = DAG.getMaskedStore(Chain, DL, DataHi, Ptr, MaskHi, MMO);
+    Hi = DAG.getMaskedStore(Chain, DL, DataHi, Ptr, MaskHi, HiMemVT, MMO,
+                            MST->isTruncatingStore());

     AddToWorklist(Lo.getNode());
     AddToWorklist(Hi.getNode());
@@ -4958,7 +4960,8 @@ SDValue DAGCombiner::visitMLOAD(SDNode *N) {
                           MachineMemOperand::MOLoad, LoMemVT.getStoreSize(),
                           Alignment, MLD->getAAInfo(), MLD->getRanges());

-    Lo = DAG.getMaskedLoad(LoVT, DL, Chain, Ptr, MaskLo, Src0Lo, MMO);
+    Lo = DAG.getMaskedLoad(LoVT, DL, Chain, Ptr, MaskLo, Src0Lo, LoMemVT, MMO,
+                           ISD::NON_EXTLOAD);

     unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
     Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
@@ -4969,7 +4972,8 @@ SDValue DAGCombiner::visitMLOAD(SDNode *N) {
                           MachineMemOperand::MOLoad, HiMemVT.getStoreSize(),
                           SecondHalfAlignment, MLD->getAAInfo(), MLD->getRanges());

-    Hi = DAG.getMaskedLoad(HiVT, DL, Chain, Ptr, MaskHi, Src0Hi, MMO);
+    Hi = DAG.getMaskedLoad(HiVT, DL, Chain, Ptr, MaskHi, Src0Hi, HiMemVT, MMO,
+                           ISD::NON_EXTLOAD);

     AddToWorklist(Lo.getNode());
     AddToWorklist(Hi.getNode());
@@ -9482,6 +9486,8 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
     unsigned MSB = BitWidth - Imm.countLeadingZeros() - 1;
     unsigned NewBW = NextPowerOf2(MSB - ShAmt);
     EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
+    // The narrowing should be profitable, the load/store operation should be
+    // legal (or custom) and the store size should be equal to the NewVT width.
     while (NewBW < BitWidth &&
            !(TLI.isOperationLegalOrCustom(Opc, NewVT) &&
              TLI.isNarrowingProfitable(VT, NewVT))) {
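The annotated loop looks for the smallest power-of-two width, starting from the bits the store actually touches, that the target considers legal and profitable. A standalone model of that search (the legality predicate here is a stand-in for the TLI queries; NextPowerOf2 follows the MathExtras definition):

    #include <cstdint>
    #include <iostream>

    // Smallest power of two strictly greater than V, as in llvm/Support/MathExtras.h.
    static uint64_t NextPowerOf2(uint64_t V) {
      V |= V >> 1;  V |= V >> 2;  V |= V >> 4;
      V |= V >> 8;  V |= V >> 16; V |= V >> 32;
      return V + 1;
    }

    int main() {
      unsigned BitWidth = 64, MSB = 37, ShAmt = 16;
      // Hypothetical target: only 8/16/32-bit narrow ops are worthwhile.
      auto ProfitableAndLegal = [](unsigned BW) { return BW == 8 || BW == 16 || BW == 32; };

      unsigned NewBW = NextPowerOf2(MSB - ShAmt);     // covers bits [ShAmt, MSB]
      while (NewBW < BitWidth && !ProfitableAndLegal(NewBW))
        NewBW = NextPowerOf2(NewBW);                  // try the next wider type
      if (NewBW < BitWidth)
        std::cout << "narrow to i" << NewBW << "\n";  // prints "narrow to i32"
    }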

View File

@@ -458,16 +458,16 @@ SDValue DAGTypeLegalizer::PromoteIntRes_LOAD(LoadSDNode *N) {
 SDValue DAGTypeLegalizer::PromoteIntRes_MLOAD(MaskedLoadSDNode *N) {
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
   SDValue ExtSrc0 = GetPromotedInteger(N->getSrc0());
-  SDValue ExtMask = PromoteTargetBoolean(N->getMask(), NVT);
+
+  SDValue Mask = N->getMask();
+  EVT NewMaskVT = getSetCCResultType(NVT);
+  if (NewMaskVT != N->getMask().getValueType())
+    Mask = PromoteTargetBoolean(Mask, NewMaskVT);
   SDLoc dl(N);

-  MachineMemOperand *MMO = DAG.getMachineFunction().
-    getMachineMemOperand(N->getPointerInfo(),
-                         MachineMemOperand::MOLoad, NVT.getStoreSize(),
-                         N->getAlignment(), N->getAAInfo(), N->getRanges());
-
   SDValue Res = DAG.getMaskedLoad(NVT, dl, N->getChain(), N->getBasePtr(),
-                                  ExtMask, ExtSrc0, MMO);
+                                  Mask, ExtSrc0, N->getMemoryVT(),
+                                  N->getMemOperand(), ISD::SEXTLOAD);
   // Legalized the chain result - switch anything that used the old chain to
   // use the new one.
   ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
@@ -1117,16 +1117,18 @@ SDValue DAGTypeLegalizer::PromoteIntOp_STORE(StoreSDNode *N, unsigned OpNo){
 SDValue DAGTypeLegalizer::PromoteIntOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo){

   assert(OpNo == 2 && "Only know how to promote the mask!");
-  SDValue DataOp = N->getData();
+  SDValue DataOp = N->getValue();
   EVT DataVT = DataOp.getValueType();
   SDValue Mask = N->getMask();
   EVT MaskVT = Mask.getValueType();
   SDLoc dl(N);

+  bool TruncateStore = false;
   if (!TLI.isTypeLegal(DataVT)) {
     if (getTypeAction(DataVT) == TargetLowering::TypePromoteInteger) {
       DataOp = GetPromotedInteger(DataOp);
       Mask = PromoteTargetBoolean(Mask, DataOp.getValueType());
+      TruncateStore = true;
     }
     else {
       assert(getTypeAction(DataVT) == TargetLowering::TypeWidenVector &&
@@ -1156,10 +1158,9 @@ SDValue DAGTypeLegalizer::PromoteIntOp_MSTORE(MaskedStoreSDNode *N, unsigned OpN
   }
   else
     Mask = PromoteTargetBoolean(N->getMask(), DataOp.getValueType());
-  SmallVector<SDValue, 4> NewOps(N->op_begin(), N->op_end());
-  NewOps[2] = Mask;
-  NewOps[3] = DataOp;
-  return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
+
+  return DAG.getMaskedStore(N->getChain(), dl, DataOp, N->getBasePtr(), Mask,
+                            N->getMemoryVT(), N->getMemOperand(),
+                            TruncateStore);
 }

 SDValue DAGTypeLegalizer::PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo){

View File

@@ -659,6 +659,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   SDValue WidenVecOp_EXTRACT_VECTOR_ELT(SDNode *N);
   SDValue WidenVecOp_EXTRACT_SUBVECTOR(SDNode *N);
   SDValue WidenVecOp_STORE(SDNode* N);
+  SDValue WidenVecOp_MSTORE(SDNode* N, unsigned OpNo);
   SDValue WidenVecOp_SETCC(SDNode* N);

   SDValue WidenVecOp_Convert(SDNode *N);

View File

@@ -992,6 +992,7 @@ void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD,
   SDValue Ptr = MLD->getBasePtr();
   SDValue Mask = MLD->getMask();
   unsigned Alignment = MLD->getOriginalAlignment();
+  ISD::LoadExtType ExtType = MLD->getExtensionType();

   // if Alignment is equal to the vector size,
   // take the half of it for the second part
@@ -1015,7 +1016,8 @@ void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD,
                          MachineMemOperand::MOLoad, LoMemVT.getStoreSize(),
                          Alignment, MLD->getAAInfo(), MLD->getRanges());

-  Lo = DAG.getMaskedLoad(LoVT, dl, Ch, Ptr, MaskLo, Src0Lo, MMO);
+  Lo = DAG.getMaskedLoad(LoVT, dl, Ch, Ptr, MaskLo, Src0Lo, LoMemVT, MMO,
+                         ExtType);

   unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
   Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
@@ -1026,7 +1028,8 @@ void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD,
                          MachineMemOperand::MOLoad, HiMemVT.getStoreSize(),
                          SecondHalfAlignment, MLD->getAAInfo(), MLD->getRanges());

-  Hi = DAG.getMaskedLoad(HiVT, dl, Ch, Ptr, MaskHi, Src0Hi, MMO);
+  Hi = DAG.getMaskedLoad(HiVT, dl, Ch, Ptr, MaskHi, Src0Hi, HiMemVT, MMO,
+                         ExtType);

   // Build a factor node to remember that this load is independent of the
@@ -1464,7 +1467,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N,
   SDValue Ch  = N->getChain();
   SDValue Ptr = N->getBasePtr();
   SDValue Mask = N->getMask();
-  SDValue Data = N->getData();
+  SDValue Data = N->getValue();
   EVT MemoryVT = N->getMemoryVT();
   unsigned Alignment = N->getOriginalAlignment();
   SDLoc DL(N);
@@ -1489,7 +1492,8 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N,
                          MachineMemOperand::MOStore, LoMemVT.getStoreSize(),
                          Alignment, N->getAAInfo(), N->getRanges());

-  Lo = DAG.getMaskedStore(Ch, DL, DataLo, Ptr, MaskLo, MMO);
+  Lo = DAG.getMaskedStore(Ch, DL, DataLo, Ptr, MaskLo, LoMemVT, MMO,
+                          N->isTruncatingStore());

   unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
   Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
@@ -1500,7 +1504,8 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N,
                          MachineMemOperand::MOStore, HiMemVT.getStoreSize(),
                          SecondHalfAlignment, N->getAAInfo(), N->getRanges());

-  Hi = DAG.getMaskedStore(Ch, DL, DataHi, Ptr, MaskHi, MMO);
+  Hi = DAG.getMaskedStore(Ch, DL, DataHi, Ptr, MaskHi, HiMemVT, MMO,
+                          N->isTruncatingStore());

   // Build a factor node to remember that this store is independent of the
@@ -2412,6 +2417,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_MLOAD(MaskedLoadSDNode *N) {
   SDValue Mask = N->getMask();
   EVT MaskVT = Mask.getValueType();
   SDValue Src0 = GetWidenedVector(N->getSrc0());
+  ISD::LoadExtType ExtType = N->getExtensionType();
   SDLoc dl(N);

   if (getTypeAction(MaskVT) == TargetLowering::TypeWidenVector)
@@ -2434,14 +2440,9 @@ SDValue DAGTypeLegalizer::WidenVecRes_MLOAD(MaskedLoadSDNode *N) {
     Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, BoolVT, Ops);
   }

-  // Rebuild memory operand because MemoryVT was changed
-  MachineMemOperand *MMO = DAG.getMachineFunction().
-    getMachineMemOperand(N->getPointerInfo(),
-                         MachineMemOperand::MOLoad, WidenVT.getStoreSize(),
-                         N->getAlignment(), N->getAAInfo(), N->getRanges());
-
   SDValue Res = DAG.getMaskedLoad(WidenVT, dl, N->getChain(), N->getBasePtr(),
-                                  Mask, Src0, MMO);
+                                  Mask, Src0, N->getMemoryVT(),
+                                  N->getMemOperand(), ExtType);
   // Legalized the chain result - switch anything that used the old chain to
   // use the new one.
   ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
@@ -2593,6 +2594,7 @@ bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) {
   case ISD::EXTRACT_SUBVECTOR:  Res = WidenVecOp_EXTRACT_SUBVECTOR(N); break;
   case ISD::EXTRACT_VECTOR_ELT: Res = WidenVecOp_EXTRACT_VECTOR_ELT(N); break;
   case ISD::STORE:              Res = WidenVecOp_STORE(N); break;
+  case ISD::MSTORE:             Res = WidenVecOp_MSTORE(N, OpNo); break;
   case ISD::SETCC:              Res = WidenVecOp_SETCC(N); break;

   case ISD::ANY_EXTEND:
@@ -2791,6 +2793,42 @@ SDValue DAGTypeLegalizer::WidenVecOp_STORE(SDNode *N) {
   return DAG.getNode(ISD::TokenFactor, SDLoc(ST), MVT::Other, StChain);
 }

+SDValue DAGTypeLegalizer::WidenVecOp_MSTORE(SDNode *N, unsigned OpNo) {
+  MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
+  SDValue Mask = MST->getMask();
+  EVT MaskVT = Mask.getValueType();
+  SDValue StVal = MST->getValue();
+  // Widen the value
+  SDValue WideVal = GetWidenedVector(StVal);
+  SDLoc dl(N);
+
+  if (OpNo == 2 || getTypeAction(MaskVT) == TargetLowering::TypeWidenVector)
+    Mask = GetWidenedVector(Mask);
+  else {
+    // The mask should be widened as well
+    EVT BoolVT = getSetCCResultType(WideVal.getValueType());
+    // We can't use ModifyToType() because we should fill the mask with
+    // zeroes
+    unsigned WidenNumElts = BoolVT.getVectorNumElements();
+    unsigned MaskNumElts = MaskVT.getVectorNumElements();
+
+    unsigned NumConcat = WidenNumElts / MaskNumElts;
+    SmallVector<SDValue, 16> Ops(NumConcat);
+    SDValue ZeroVal = DAG.getConstant(0, MaskVT);
+    Ops[0] = Mask;
+    for (unsigned i = 1; i != NumConcat; ++i)
+      Ops[i] = ZeroVal;
+
+    Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, BoolVT, Ops);
+  }
+  assert(Mask.getValueType().getVectorNumElements() ==
+         WideVal.getValueType().getVectorNumElements() &&
+         "Mask and data vectors should have the same number of elements");
+  return DAG.getMaskedStore(MST->getChain(), dl, WideVal, MST->getBasePtr(),
+                            Mask, MST->getMemoryVT(), MST->getMemOperand(),
+                            false);
+}
+
 SDValue DAGTypeLegalizer::WidenVecOp_SETCC(SDNode *N) {
   SDValue InOp0 = GetWidenedVector(N->getOperand(0));
   SDValue InOp1 = GetWidenedVector(N->getOperand(1));
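WidenVecOp_MSTORE deliberately avoids ModifyToType() because the padding lanes of a store mask must be zero; undef padding could enable stores the scalar loop never performed. A self-contained sketch of that concat-with-zero widening (plain C++, bytes standing in for i1 lanes):

    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Widen an N-lane mask to WidenNumElts lanes by appending zero vectors,
    // as in WidenVecOp_MSTORE: extra lanes are 0 so no extra stores happen.
    std::vector<uint8_t> widenMask(const std::vector<uint8_t> &Mask,
                                   unsigned WidenNumElts) {
      unsigned MaskNumElts = Mask.size();
      unsigned NumConcat = WidenNumElts / MaskNumElts;
      std::vector<uint8_t> Wide;
      Wide.reserve(WidenNumElts);
      Wide.insert(Wide.end(), Mask.begin(), Mask.end());  // Ops[0] = Mask
      for (unsigned I = 1; I != NumConcat; ++I)           // Ops[i] = ZeroVal
        Wide.insert(Wide.end(), MaskNumElts, 0);
      return Wide;
    }

    int main() {
      for (uint8_t B : widenMask({1, 0, 1, 1}, 8))
        std::cout << (unsigned)B;
      std::cout << "\n";  // prints 10110000
    }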

View File

@@ -4924,15 +4924,15 @@ SelectionDAG::getIndexedStore(SDValue OrigStore, SDLoc dl, SDValue Base,
 SDValue
 SelectionDAG::getMaskedLoad(EVT VT, SDLoc dl, SDValue Chain,
-                            SDValue Ptr, SDValue Mask, SDValue Src0,
-                            MachineMemOperand *MMO) {
+                            SDValue Ptr, SDValue Mask, SDValue Src0, EVT MemVT,
+                            MachineMemOperand *MMO, ISD::LoadExtType ExtTy) {

   SDVTList VTs = getVTList(VT, MVT::Other);
   SDValue Ops[] = { Chain, Ptr, Mask, Src0 };
   FoldingSetNodeID ID;
   AddNodeIDNode(ID, ISD::MLOAD, VTs, Ops);
   ID.AddInteger(VT.getRawBits());
-  ID.AddInteger(encodeMemSDNodeFlags(ISD::NON_EXTLOAD, ISD::UNINDEXED,
+  ID.AddInteger(encodeMemSDNodeFlags(ExtTy, ISD::UNINDEXED,
                                      MMO->isVolatile(),
                                      MMO->isNonTemporal(),
                                      MMO->isInvariant()));
@@ -4944,14 +4944,15 @@ SelectionDAG::getMaskedLoad(EVT VT, SDLoc dl, SDValue Chain,
   }
   SDNode *N = new (NodeAllocator) MaskedLoadSDNode(dl.getIROrder(),
                                                    dl.getDebugLoc(), Ops, 4, VTs,
-                                                   VT, MMO);
+                                                   ExtTy, MemVT, MMO);
   CSEMap.InsertNode(N, IP);
   InsertNode(N);
   return SDValue(N, 0);
 }

 SDValue SelectionDAG::getMaskedStore(SDValue Chain, SDLoc dl, SDValue Val,
-                                     SDValue Ptr, SDValue Mask, MachineMemOperand *MMO) {
+                                     SDValue Ptr, SDValue Mask, EVT MemVT,
+                                     MachineMemOperand *MMO, bool isTrunc) {
   assert(Chain.getValueType() == MVT::Other &&
         "Invalid chain type");
   EVT VT = Val.getValueType();
@@ -4970,7 +4971,7 @@ SDValue SelectionDAG::getMaskedStore(SDValue Chain, SDLoc dl, SDValue Val,
   }
   SDNode *N = new (NodeAllocator) MaskedStoreSDNode(dl.getIROrder(),
                                                     dl.getDebugLoc(), Ops, 4,
-                                                    VTs, VT, MMO);
+                                                    VTs, isTrunc, MemVT, MMO);
   CSEMap.InsertNode(N, IP);
   InsertNode(N);
   return SDValue(N, 0);

View File

@@ -3667,7 +3667,8 @@ void SelectionDAGBuilder::visitMaskedStore(const CallInst &I) {
                           getMachineMemOperand(MachinePointerInfo(PtrOperand),
                           MachineMemOperand::MOStore, VT.getStoreSize(),
                           Alignment, AAInfo);
-  SDValue StoreNode = DAG.getMaskedStore(getRoot(), sdl, Src0, Ptr, Mask, MMO);
+  SDValue StoreNode = DAG.getMaskedStore(getRoot(), sdl, Src0, Ptr, Mask, VT,
+                                         MMO, false);
   DAG.setRoot(StoreNode);
   setValue(&I, StoreNode);
 }
@@ -3706,7 +3707,8 @@ void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I) {
                          MachineMemOperand::MOLoad, VT.getStoreSize(),
                          Alignment, AAInfo, Ranges);

-  SDValue Load = DAG.getMaskedLoad(VT, sdl, InChain, Ptr, Mask, Src0, MMO);
+  SDValue Load = DAG.getMaskedLoad(VT, sdl, InChain, Ptr, Mask, Src0, VT, MMO,
+                                   ISD::NON_EXTLOAD);
   SDValue OutChain = Load.getValue(1);
   DAG.setRoot(OutChain);
   setValue(&I, Load);

View File

@@ -4,7 +4,6 @@ add_llvm_library(LLVMExecutionEngine
   ExecutionEngine.cpp
   ExecutionEngineBindings.cpp
   GDBRegistrationListener.cpp
-  RTDyldMemoryManager.cpp
   TargetSelect.cpp
   )

View File

@@ -22,4 +22,4 @@ subdirectories = Interpreter MCJIT RuntimeDyld IntelJITEvents OProfileJIT
 type = Library
 name = ExecutionEngine
 parent = Libraries
-required_libraries = Core MC Object Support
+required_libraries = Core MC Object Support RuntimeDyld

View File

@@ -1,4 +1,5 @@
 add_llvm_library(LLVMRuntimeDyld
+  RTDyldMemoryManager.cpp
   RuntimeDyld.cpp
   RuntimeDyldChecker.cpp
   RuntimeDyldELF.cpp

View File

@@ -257,11 +257,11 @@ Constant *Constant::getAggregateElement(unsigned Elt) const {
   if (const ConstantVector *CV = dyn_cast<ConstantVector>(this))
     return Elt < CV->getNumOperands() ? CV->getOperand(Elt) : nullptr;

-  if (const ConstantAggregateZero *CAZ =dyn_cast<ConstantAggregateZero>(this))
-    return CAZ->getElementValue(Elt);
+  if (const ConstantAggregateZero *CAZ = dyn_cast<ConstantAggregateZero>(this))
+    return Elt < CAZ->getNumElements() ? CAZ->getElementValue(Elt) : nullptr;

   if (const UndefValue *UV = dyn_cast<UndefValue>(this))
-    return UV->getElementValue(Elt);
+    return Elt < UV->getNumElements() ? UV->getElementValue(Elt) : nullptr;

   if (const ConstantDataSequential *CDS =dyn_cast<ConstantDataSequential>(this))
     return Elt < CDS->getNumElements() ? CDS->getElementAsConstant(Elt)
@@ -764,6 +764,14 @@ Constant *ConstantAggregateZero::getElementValue(unsigned Idx) const {
   return getStructElement(Idx);
 }

+unsigned ConstantAggregateZero::getNumElements() const {
+  const Type *Ty = getType();
+  if (const auto *AT = dyn_cast<ArrayType>(Ty))
+    return AT->getNumElements();
+  if (const auto *VT = dyn_cast<VectorType>(Ty))
+    return VT->getNumElements();
+  return Ty->getStructNumElements();
+}

 //===----------------------------------------------------------------------===//
 //                         UndefValue Implementation
@@ -797,7 +805,14 @@ UndefValue *UndefValue::getElementValue(unsigned Idx) const {
   return getStructElement(Idx);
 }

+unsigned UndefValue::getNumElements() const {
+  const Type *Ty = getType();
+  if (const auto *AT = dyn_cast<ArrayType>(Ty))
+    return AT->getNumElements();
+  if (const auto *VT = dyn_cast<VectorType>(Ty))
+    return VT->getNumElements();
+  return Ty->getStructNumElements();
+}

 //===----------------------------------------------------------------------===//
 //                         ConstantXXX Classes
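With the two getNumElements() helpers in place, getAggregateElement now bounds-checks zeroinitializer and undef aggregates and returns nullptr for an out-of-range index instead of crashing; the new instsimplify test later in this commit exercises exactly that on the empty struct type {}. A generic sketch of the guard pattern (illustrative types, not the LLVM API):

    #include <iostream>
    #include <vector>

    // Guarded element access: return nullptr instead of indexing out of
    // range, mirroring the Elt < getNumElements() checks added above.
    const int *getAggregateElement(const std::vector<int> &Agg, unsigned Elt) {
      return Elt < Agg.size() ? &Agg[Elt] : nullptr;
    }

    int main() {
      std::vector<int> ZeroInit;  // an empty aggregate, like '{}'
      const int *E = getAggregateElement(ZeroInit, 0);
      std::cout << (E ? "element" : "nullptr") << "\n";  // nullptr, no crash
    }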

View File

@@ -1679,7 +1679,9 @@ void X86TargetLowering::resetOperationActions() {
   setTargetDAGCombine(ISD::FMA);
   setTargetDAGCombine(ISD::SUB);
   setTargetDAGCombine(ISD::LOAD);
+  setTargetDAGCombine(ISD::MLOAD);
   setTargetDAGCombine(ISD::STORE);
+  setTargetDAGCombine(ISD::MSTORE);
   setTargetDAGCombine(ISD::ZERO_EXTEND);
   setTargetDAGCombine(ISD::ANY_EXTEND);
   setTargetDAGCombine(ISD::SIGN_EXTEND);
@@ -24738,6 +24740,166 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }

+/// PerformMLOADCombine - Resolve extending loads
+static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   const X86Subtarget *Subtarget) {
+  MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
+  if (Mld->getExtensionType() != ISD::SEXTLOAD)
+    return SDValue();
+
+  EVT VT = Mld->getValueType(0);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  unsigned NumElems = VT.getVectorNumElements();
+  EVT LdVT = Mld->getMemoryVT();
+  SDLoc dl(Mld);
+
+  assert(LdVT != VT && "Cannot extend to the same type");
+  unsigned ToSz = VT.getVectorElementType().getSizeInBits();
+  unsigned FromSz = LdVT.getVectorElementType().getSizeInBits();
+  // From, To sizes and ElemCount must be pow of two
+  assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
+          "Unexpected size for extending masked load");
+
+  unsigned SizeRatio = ToSz / FromSz;
+  assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
+
+  // Create a type on which we perform the shuffle
+  EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
+                                   LdVT.getScalarType(), NumElems*SizeRatio);
+  assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
+
+  // Convert Src0 value
+  SDValue WideSrc0 = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mld->getSrc0());
+  if (Mld->getSrc0().getOpcode() != ISD::UNDEF) {
+    SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
+    for (unsigned i = 0; i != NumElems; ++i)
+      ShuffleVec[i] = i * SizeRatio;
+
+    // Can't shuffle using an illegal type.
+    assert (TLI.isTypeLegal(WideVecVT) && "WideVecVT should be legal");
+    WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
+                                    DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
+  }
+  // Prepare the new mask
+  SDValue NewMask;
+  SDValue Mask = Mld->getMask();
+  if (Mask.getValueType() == VT) {
+    // Mask and original value have the same type
+    NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask);
+    SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
+    for (unsigned i = 0; i != NumElems; ++i)
+      ShuffleVec[i] = i * SizeRatio;
+    for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
+      ShuffleVec[i] = NumElems*SizeRatio;
+    NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
+                                   DAG.getConstant(0, WideVecVT),
+                                   &ShuffleVec[0]);
+  }
+  else {
+    assert(Mask.getValueType().getVectorElementType() == MVT::i1);
+    unsigned WidenNumElts = NumElems*SizeRatio;
+    unsigned MaskNumElts = VT.getVectorNumElements();
+    EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+                                     WidenNumElts);
+
+    unsigned NumConcat = WidenNumElts / MaskNumElts;
+    SmallVector<SDValue, 16> Ops(NumConcat);
+    SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType());
+    Ops[0] = Mask;
+    for (unsigned i = 1; i != NumConcat; ++i)
+      Ops[i] = ZeroVal;
+
+    NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
+  }
+
+  SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
+                                     Mld->getBasePtr(), NewMask, WideSrc0,
+                                     Mld->getMemoryVT(), Mld->getMemOperand(),
+                                     ISD::NON_EXTLOAD);
+  SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
+  return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
+}
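The rewrite is sound because sign extension commutes with the lane-wise select that a masked load performs: extending each loaded or pass-through lane afterwards (the VSEXT) yields the same lanes as an extending load would. A self-contained scalar check of that identity (illustrative values, not LLVM code):

    #include <cstdint>
    #include <iostream>

    // Core fact used by the combine:
    //   select(m, sext(x), sext(s)) == sext(select(m, x, s))
    // The masked load computes the select; the VSEXT is hoisted out of it.
    int main() {
      const unsigned N = 4;
      int16_t Mem[N]  = {-1, 42, -300, 7};  // narrow values in memory
      int16_t Src0[N] = {9, -9, 99, -99};   // narrow pass-through lanes
      bool    Mask[N] = {true, false, true, false};

      for (unsigned I = 0; I != N; ++I) {
        int32_t ExtendThenSelect = Mask[I] ? (int32_t)Mem[I] : (int32_t)Src0[I];
        int32_t SelectThenExtend = (int32_t)(Mask[I] ? Mem[I] : Src0[I]);
        if (ExtendThenSelect != SelectThenExtend)
          return 1;                         // never happens
      }
      std::cout << "sext(select) == select(sext)\n";
    }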
+/// PerformMSTORECombine - Resolve truncating stores
+static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
+                                    const X86Subtarget *Subtarget) {
+  MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
+  if (!Mst->isTruncatingStore())
+    return SDValue();
+
+  EVT VT = Mst->getValue().getValueType();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  unsigned NumElems = VT.getVectorNumElements();
+  EVT StVT = Mst->getMemoryVT();
+  SDLoc dl(Mst);
+
+  assert(StVT != VT && "Cannot truncate to the same type");
+  unsigned FromSz = VT.getVectorElementType().getSizeInBits();
+  unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
+
+  // From, To sizes and ElemCount must be pow of two
+  assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
+          "Unexpected size for truncating masked store");
+  // We are going to use the original vector elt for storing.
+  // Accumulated smaller vector elements must be a multiple of the store size.
+  assert (((NumElems * FromSz) % ToSz) == 0 &&
+          "Unexpected ratio for truncating masked store");
+
+  unsigned SizeRatio = FromSz / ToSz;
+  assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
+
+  // Create a type on which we perform the shuffle
+  EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
+                                   StVT.getScalarType(), NumElems*SizeRatio);
+  assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
+
+  SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mst->getValue());
+  SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
+  for (unsigned i = 0; i != NumElems; ++i)
+    ShuffleVec[i] = i * SizeRatio;
+
+  // Can't shuffle using an illegal type.
+  assert (TLI.isTypeLegal(WideVecVT) && "WideVecVT should be legal");
+
+  SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
+                                              DAG.getUNDEF(WideVecVT),
+                                              &ShuffleVec[0]);
+
+  SDValue NewMask;
+  SDValue Mask = Mst->getMask();
+  if (Mask.getValueType() == VT) {
+    // Mask and original value have the same type
+    NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask);
+    for (unsigned i = 0; i != NumElems; ++i)
+      ShuffleVec[i] = i * SizeRatio;
+    for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
+      ShuffleVec[i] = NumElems*SizeRatio;
+    NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
+                                   DAG.getConstant(0, WideVecVT),
+                                   &ShuffleVec[0]);
+  }
+  else {
+    assert(Mask.getValueType().getVectorElementType() == MVT::i1);
+    unsigned WidenNumElts = NumElems*SizeRatio;
+    unsigned MaskNumElts = VT.getVectorNumElements();
+    EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+                                     WidenNumElts);
+
+    unsigned NumConcat = WidenNumElts / MaskNumElts;
+    SmallVector<SDValue, 16> Ops(NumConcat);
+    SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType());
+    Ops[0] = Mask;
+    for (unsigned i = 1; i != NumConcat; ++i)
+      Ops[i] = ZeroVal;
+
+    NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
+  }
+
+  return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, Mst->getBasePtr(),
+                            NewMask, StVT, Mst->getMemOperand(), false);
+}
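The truncation itself is done with a bitcast to narrow lanes plus a shuffle that gathers lane i * SizeRatio for each element; on a little-endian target such as x86 that lane is exactly the low, truncated half of wide lane i. A scalar model of that index arithmetic (self-contained; little-endian assumed):

    #include <cstdint>
    #include <cstring>
    #include <iostream>

    int main() {
      const unsigned NumElems = 4, SizeRatio = 2;   // i32 -> i16 (FromSz/ToSz)
      int32_t Wide[NumElems] = {0x11112222, -1, 0x7fff, 0x00038000};

      // Bitcast <4 x i32> to <8 x i16>: each i32 lane becomes two i16 lanes.
      int16_t Lanes[NumElems * SizeRatio];
      std::memcpy(Lanes, Wide, sizeof(Wide));

      // Shuffle gathering lane i*SizeRatio: on little-endian this is the low
      // (truncated) half of wide lane i, i.e. ShuffleVec[i] = i * SizeRatio.
      int16_t Truncated[NumElems];
      for (unsigned I = 0; I != NumElems; ++I)
        Truncated[I] = Lanes[I * SizeRatio];

      for (int16_t V : Truncated)
        std::cout << std::hex << (uint16_t)V << ' ';  // 2222 ffff 7fff 8000
      std::cout << "\n";
    }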
 /// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
 static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
                                    const X86Subtarget *Subtarget) {
@@ -25836,7 +25998,9 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::OR:             return PerformOrCombine(N, DAG, DCI, Subtarget);
   case ISD::XOR:            return PerformXorCombine(N, DAG, DCI, Subtarget);
   case ISD::LOAD:           return PerformLOADCombine(N, DAG, DCI, Subtarget);
+  case ISD::MLOAD:          return PerformMLOADCombine(N, DAG, DCI, Subtarget);
   case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
+  case ISD::MSTORE:         return PerformMSTORECombine(N, DAG, Subtarget);
   case ISD::SINT_TO_FP:     return PerformSINT_TO_FPCombine(N, DAG, this);
   case ISD::FADD:           return PerformFADDCombine(N, DAG, Subtarget);
   case ISD::FSUB:           return PerformFSUBCombine(N, DAG, Subtarget);

View File

@@ -403,7 +403,7 @@ bool MergedLoadStoreMotion::isStoreSinkBarrierInRange(const Instruction& Start,
                                                       const Instruction& End,
                                                       AliasAnalysis::Location
                                                       Loc) {
-  return AA->canInstructionRangeModRef(Start, End, Loc, AliasAnalysis::Ref);
+  return AA->canInstructionRangeModRef(Start, End, Loc, AliasAnalysis::ModRef);
 }

 ///
@@ -414,6 +414,7 @@ bool MergedLoadStoreMotion::isStoreSinkBarrierInRange(const Instruction& Start,
 StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB1,
                                                    StoreInst *Store0) {
   DEBUG(dbgs() << "can Sink? : "; Store0->dump(); dbgs() << "\n");
+  BasicBlock *BB0 = Store0->getParent();
   for (BasicBlock::reverse_iterator RBI = BB1->rbegin(), RBE = BB1->rend();
        RBI != RBE; ++RBI) {
     Instruction *Inst = &*RBI;
@@ -422,13 +423,14 @@ StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB1,
       continue;

     StoreInst *Store1 = cast<StoreInst>(Inst);
-    BasicBlock *BB0 = Store0->getParent();

     AliasAnalysis::Location Loc0 = AA->getLocation(Store0);
     AliasAnalysis::Location Loc1 = AA->getLocation(Store1);
     if (AA->isMustAlias(Loc0, Loc1) && Store0->isSameOperationAs(Store1) &&
-        !isStoreSinkBarrierInRange(*Store1, BB1->back(), Loc1) &&
-        !isStoreSinkBarrierInRange(*Store0, BB0->back(), Loc0)) {
+        !isStoreSinkBarrierInRange(*(std::next(BasicBlock::iterator(Store1))),
+                                   BB1->back(), Loc1) &&
+        !isStoreSinkBarrierInRange(*(std::next(BasicBlock::iterator(Store0))),
+                                   BB0->back(), Loc0)) {
       return Store1;
     }
   }

View File

@@ -55,7 +55,7 @@ STATISTIC(NumRuntimeUnrolled,
 ///     - Branch around the original loop if the trip count is less
 ///       than the unroll factor.
 ///
-static void ConnectProlog(Loop *L, Value *TripCount, unsigned Count,
+static void ConnectProlog(Loop *L, Value *BECount, unsigned Count,
                           BasicBlock *LastPrologBB, BasicBlock *PrologEnd,
                           BasicBlock *OrigPH, BasicBlock *NewPH,
                           ValueToValueMapTy &VMap, Pass *P) {
@@ -105,12 +105,19 @@ static void ConnectProlog(Loop *L, Value *TripCount, unsigned Count,
     }
   }

-  // Create a branch around the orignal loop, which is taken if the
-  // trip count is less than the unroll factor.
+  // Create a branch around the orignal loop, which is taken if there are no
+  // iterations remaining to be executed after running the prologue.
   Instruction *InsertPt = PrologEnd->getTerminator();
+
+  assert(Count != 0 && "nonsensical Count!");
+
+  // If BECount <u (Count - 1) then (BECount + 1) & (Count - 1) == (BECount + 1)
+  // (since Count is a power of 2).  This means %xtraiter is (BECount + 1) and
+  // all of the iterations of this loop were executed by the prologue.  Note
+  // that if BECount <u (Count - 1) then (BECount + 1) cannot unsigned-overflow.
   Instruction *BrLoopExit =
-      new ICmpInst(InsertPt, ICmpInst::ICMP_ULT, TripCount,
-                   ConstantInt::get(TripCount->getType(), Count));
+      new ICmpInst(InsertPt, ICmpInst::ICMP_ULT, BECount,
+                   ConstantInt::get(BECount->getType(), Count - 1));
   BasicBlock *Exit = L->getUniqueExitBlock();
   assert(Exit && "Loop must have a single exit block only");
   // Split the exit to maintain loop canonicalization guarantees
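The identity asserted in the new comment is easy to check exhaustively for one unroll factor: when Count is a power of two and BECount <u Count - 1, masking BECount + 1 with Count - 1 changes nothing, so the prologue alone has already run every iteration. A standalone check (hypothetical driver, not LLVM code):

    #include <cstdint>
    #include <iostream>

    int main() {
      const uint32_t Count = 8;  // unroll factor, a power of two
      for (uint64_t BECount = 0; BECount != 64; ++BECount) {
        uint64_t XtraIter = (BECount + 1) & (Count - 1);  // %xtraiter
        bool SkipUnrolledLoop = BECount < Count - 1;      // the new branch condition
        // If we branch around the unrolled loop, the prologue must have run
        // all BECount + 1 iterations.
        if (SkipUnrolledLoop && XtraIter != BECount + 1)
          return 1;                                       // identity violated (never)
      }
      std::cout << "BECount <u Count-1  ==>  xtraiter == BECount+1\n";
    }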
@@ -292,23 +299,28 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI,
   // Only unroll loops with a computable trip count and the trip count needs
   // to be an int value (allowing a pointer type is a TODO item)
-  const SCEV *BECount = SE->getBackedgeTakenCount(L);
-  if (isa<SCEVCouldNotCompute>(BECount) || !BECount->getType()->isIntegerTy())
+  const SCEV *BECountSC = SE->getBackedgeTakenCount(L);
+  if (isa<SCEVCouldNotCompute>(BECountSC) ||
+      !BECountSC->getType()->isIntegerTy())
     return false;

-  // If BECount is INT_MAX, we can't compute trip-count without overflow.
-  if (BECount->isAllOnesValue())
-    return false;
+  unsigned BEWidth = cast<IntegerType>(BECountSC->getType())->getBitWidth();

   // Add 1 since the backedge count doesn't include the first loop iteration
   const SCEV *TripCountSC =
-      SE->getAddExpr(BECount, SE->getConstant(BECount->getType(), 1));
+      SE->getAddExpr(BECountSC, SE->getConstant(BECountSC->getType(), 1));
   if (isa<SCEVCouldNotCompute>(TripCountSC))
     return false;

   // We only handle cases when the unroll factor is a power of 2.
   // Count is the loop unroll factor, the number of extra copies added + 1.
-  if ((Count & (Count-1)) != 0)
+  if (!isPowerOf2_32(Count))
+    return false;
+
+  // This constraint lets us deal with an overflowing trip count easily; see the
+  // comment on ModVal below.  This check is equivalent to `Log2(Count) <
+  // BEWidth`.
+  if (static_cast<uint64_t>(Count) > (1ULL << BEWidth))
     return false;

   // If this loop is nested, then the loop unroller changes the code in
@@ -330,16 +342,23 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI,
   SCEVExpander Expander(*SE, "loop-unroll");
   Value *TripCount = Expander.expandCodeFor(TripCountSC, TripCountSC->getType(),
                                             PreHeaderBR);
+  Value *BECount = Expander.expandCodeFor(BECountSC, BECountSC->getType(),
+                                          PreHeaderBR);

   IRBuilder<> B(PreHeaderBR);
   Value *ModVal = B.CreateAnd(TripCount, Count - 1, "xtraiter");

-  // Check if for no extra iterations, then jump to cloned/unrolled loop.
-  // We have to check that the trip count computation didn't overflow when
-  // adding one to the backedge taken count.
-  Value *LCmp = B.CreateIsNotNull(ModVal, "lcmp.mod");
-  Value *OverflowCheck = B.CreateIsNull(TripCount, "lcmp.overflow");
-  Value *BranchVal = B.CreateOr(OverflowCheck, LCmp, "lcmp.or");
+  // If ModVal is zero, we know that either
+  //  1. there are no iteration to be run in the prologue loop
+  // OR
+  //  2. the addition computing TripCount overflowed
+  //
+  // If (2) is true, we know that TripCount really is (1 << BEWidth) and so the
+  // number of iterations that remain to be run in the original loop is a
+  // multiple Count == (1 << Log2(Count)) because Log2(Count) <= BEWidth (we
+  // explicitly check this above).
+
+  Value *BranchVal = B.CreateIsNotNull(ModVal, "lcmp.mod");

   // Branch to either the extra iterations or the cloned/unrolled loop
   // We will fix up the true branch label when adding loop body copies
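The overflow case that the new comment describes can be demonstrated with a narrow backedge-count width: when BECount is all-ones, TripCount wraps to 0, so %xtraiter is 0 and the prologue is skipped, and the 2^BEWidth remaining iterations are a multiple of Count precisely because of the Count <= (1 << BEWidth) check added above. A sketch with BEWidth == 8 (illustrative, not LLVM code):

    #include <cstdint>
    #include <iostream>

    int main() {
      const unsigned Count = 4;         // power of two, Count <= 1 << BEWidth
      uint8_t BECount = 0xff;           // BEWidth == 8, backedge count is -1
      uint8_t TripCount = BECount + 1;  // wraps to 0: really 1 << 8 iterations
      uint8_t ModVal = TripCount & (Count - 1);  // %xtraiter

      std::cout << (unsigned)ModVal << "\n";  // 0: prologue skipped...
      std::cout << (256 % Count) << "\n";     // ...and 2^BEWidth % Count == 0,
                                              // so the unrolled body exits correctly.
    }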
@@ -362,10 +381,7 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI,
   std::vector<BasicBlock *> NewBlocks;
   ValueToValueMapTy VMap;

-  // If unroll count is 2 and we can't overflow in tripcount computation (which
-  // is BECount + 1), then we don't need a loop for prologue, and we can unroll
-  // it. We can be sure that we don't overflow only if tripcount is a constant.
-  bool UnrollPrologue = (Count == 2 && isa<ConstantInt>(TripCount));
+  bool UnrollPrologue = Count == 2;

   // Clone all the basic blocks in the loop.  If Count is 2, we don't clone
   // the loop, otherwise we create a cloned loop to execute the extra
@@ -391,7 +407,7 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI,
   // Connect the prolog code to the original loop and update the
   // PHI functions.
   BasicBlock *LastLoopBB = cast<BasicBlock>(VMap[Latch]);
-  ConnectProlog(L, TripCount, Count, LastLoopBB, PEnd, PH, NewPH, VMap,
+  ConnectProlog(L, BECount, Count, LastLoopBB, PEnd, PH, NewPH, VMap,
                 LPM->getAsPass());
   NumRuntimeUnrolled++;
   return true;

View File

@@ -1874,6 +1874,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
         // wide store needs to start at the last vector element.
         PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
         PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
+        Mask[Part] = reverseVector(Mask[Part]);
       }

       Value *VecPtr = Builder.CreateBitCast(PartPtr,
@@ -1902,6 +1903,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
         // wide load needs to start at the last vector element.
         PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
         PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
+        Mask[Part] = reverseVector(Mask[Part]);
       }

       Instruction* NewLI;
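The one-line fix matters because lane j of a reversed wide memory access corresponds to scalar iteration Start - j, so a mask computed in iteration order must be reversed before it guards that access, just like the data. A scalar model of the lane correspondence (self-contained, illustrative predicate):

    #include <iostream>

    int main() {
      const int VF = 4, Start = 7;            // this part covers iterations 7,6,5,4
      bool IterMask[VF];                      // mask in iteration order
      for (int J = 0; J != VF; ++J)
        IterMask[J] = ((Start - J) % 2) == 0; // predicate of iteration Start - J

      // The wide memory op touches lanes in ascending address order: 4,5,6,7,
      // so lane J must use the reversed mask element (reverseVector(Mask)).
      for (int J = 0; J != VF; ++J) {
        int Lane = VF - 1 - J;
        std::cout << IterMask[Lane];          // guards iteration 4 + J
      }
      std::cout << "\n";                      // prints 1010: even iterations on
    }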

View File

@@ -159,7 +159,7 @@ define void @test14(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) {
 }

 ; AVX2-LABEL: test15
-; AVX2: vpmaskmovq
+; AVX2: vpmaskmovd
 define void @test15(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) {
   %mask = icmp eq <2 x i32> %trigger, zeroinitializer
   call void @llvm.masked.store.v2i32(<2 x i32>%val, <2 x i32>* %addr, i32 4, <2 x i1>%mask)
@@ -176,8 +176,9 @@ define <2 x float> @test16(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %
 }

 ; AVX2-LABEL: test17
-; AVX2: vpmaskmovq
-; AVX2: vblendvpd
+; AVX2: vpmaskmovd
+; AVX2: vblendvps
+; AVX2: vpmovsxdq
 define <2 x i32> @test17(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) {
   %mask = icmp eq <2 x i32> %trigger, zeroinitializer
   %res = call <2 x i32> @llvm.masked.load.v2i32(<2 x i32>* %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst)

View File

@@ -0,0 +1,114 @@
+; ModuleID = 'bug.c'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; RUN: opt -O2 -S < %s | FileCheck %s
+
+; CHECK_LABEL: main
+; CHECK: if.end
+; CHECK: store
+; CHECK: memset
+; CHECK: if.then
+; CHECK: store
+; CHECK: memset
+
+@d = common global i32 0, align 4
+@b = common global i32 0, align 4
+@f = common global [1 x [3 x i8]] zeroinitializer, align 1
+@e = common global i32 0, align 4
+@c = common global i32 0, align 4
+@a = common global i32 0, align 4
+
+; Function Attrs: nounwind uwtable
+define void @fn1() #0 {
+entry:
+  store i32 0, i32* @d, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc8, %entry
+  %0 = load i32* @d, align 4
+  %cmp = icmp slt i32 %0, 2
+  br i1 %cmp, label %for.body, label %for.end10
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32* @d, align 4
+  %idxprom = sext i32 %1 to i64
+  %2 = load i32* @b, align 4
+  %idxprom1 = sext i32 %2 to i64
+  %arrayidx = getelementptr inbounds [1 x [3 x i8]]* @f, i32 0, i64 %idxprom1
+  %arrayidx2 = getelementptr inbounds [3 x i8]* %arrayidx, i32 0, i64 %idxprom
+  store i8 0, i8* %arrayidx2, align 1
+  store i32 0, i32* @e, align 4
+  br label %for.cond3
+
+for.cond3:                                        ; preds = %for.inc, %for.body
+  %3 = load i32* @e, align 4
+  %cmp4 = icmp slt i32 %3, 3
+  br i1 %cmp4, label %for.body5, label %for.end
+
+for.body5:                                        ; preds = %for.cond3
+  %4 = load i32* @c, align 4
+  %tobool = icmp ne i32 %4, 0
+  br i1 %tobool, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body5
+  %5 = load i32* @a, align 4
+  %dec = add nsw i32 %5, -1
+  store i32 %dec, i32* @a, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body5
+  %6 = load i32* @e, align 4
+  %idxprom6 = sext i32 %6 to i64
+  %arrayidx7 = getelementptr inbounds [3 x i8]* getelementptr inbounds ([1 x [3 x i8]]* @f, i32 0, i64 0), i32 0, i64 %idxprom6
+  store i8 1, i8* %arrayidx7, align 1
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %7 = load i32* @e, align 4
+  %inc = add nsw i32 %7, 1
+  store i32 %inc, i32* @e, align 4
+  br label %for.cond3
+
+for.end:                                          ; preds = %for.cond3
+  br label %for.inc8
+
+for.inc8:                                         ; preds = %for.end
+  %8 = load i32* @d, align 4
+  %inc9 = add nsw i32 %8, 1
+  store i32 %inc9, i32* @d, align 4
+  br label %for.cond
+
+for.end10:                                        ; preds = %for.cond
+  ret void
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @main() #0 {
+entry:
+  %retval = alloca i32, align 4
+  store i32 0, i32* %retval
+  call void @fn1()
+  %0 = load i8* getelementptr inbounds ([1 x [3 x i8]]* @f, i32 0, i64 0, i64 1), align 1
+  %conv = sext i8 %0 to i32
+  %cmp = icmp ne i32 %conv, 1
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  call void @abort() #2
+  unreachable
+
+if.end:                                           ; preds = %entry
+  ret i32 0
+}
+
+; Function Attrs: noreturn nounwind
+declare void @abort() #1
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { noreturn nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { noreturn nounwind }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 3.7.0 (trunk 229288) (llvm/trunk 229286:229290M)"}

View File

@@ -0,0 +1,19 @@
+; RUN: opt < %s -instsimplify -S | FileCheck %s
+
+@zeroinit = constant {} zeroinitializer
+@undef = constant {} undef
+
+define i32 @crash_on_zeroinit() {
+; CHECK-LABEL: @crash_on_zeroinit
+; CHECK: ret i32 0
+  %load = load i32* bitcast ({}* @zeroinit to i32*)
+  ret i32 %load
+}
+
+define i32 @crash_on_undef() {
+; CHECK-LABEL: @crash_on_undef
+; CHECK: ret i32 undef
+  %load = load i32* bitcast ({}* @undef to i32*)
+  ret i32 %load
+}

View File

@@ -4,9 +4,7 @@
 ; CHECK: %xtraiter = and i32 %n
 ; CHECK: %lcmp.mod = icmp ne i32 %xtraiter, 0
-; CHECK: %lcmp.overflow = icmp eq i32 %n, 0
-; CHECK: %lcmp.or = or i1 %lcmp.overflow, %lcmp.mod
-; CHECK: br i1 %lcmp.or, label %for.body.prol, label %for.body.preheader.split
+; CHECK: br i1 %lcmp.mod, label %for.body.prol, label %for.body.preheader.split

 ; CHECK: for.body.prol:
 ; CHECK: %indvars.iv.prol = phi i64 [ %indvars.iv.next.prol, %for.body.prol ], [ 0, %for.body.preheader ]

View File

@@ -3,7 +3,7 @@
 ; This tests that setting the unroll count works

 ; CHECK: for.body.prol:
-; CHECK: br i1 %prol.iter.cmp, label %for.body.prol, label %for.body.preheader.split
+; CHECK: br label %for.body.preheader.split
 ; CHECK: for.body:
 ; CHECK: br i1 %exitcond.1, label %for.end.loopexit.unr-lcssa, label %for.body
 ; CHECK-NOT: br i1 %exitcond.4, label %for.end.loopexit{{.*}}, label %for.body

View File

@@ -1,19 +1,28 @@
 ; RUN: opt < %s -S -unroll-runtime -unroll-count=2 -loop-unroll | FileCheck %s
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"

-; When prologue is fully unrolled, the branch on its end is unconditional.
-; Unrolling it is illegal if we can't prove that trip-count+1 doesn't overflow,
-; like in this example, where it comes from an argument.
-;
-; This test is based on an example from here:
-; http://stackoverflow.com/questions/23838661/why-is-clang-optimizing-this-code-out
-;
+; This test case documents how runtime loop unrolling handles the case
+; when the backedge-count is -1.
+;
+; If %N, the backedge-taken count, is -1 then %0 unsigned-overflows
+; and is 0.  %xtraiter too is 0, signifying that the total trip-count
+; is divisible by 2.  The prologue then branches to the unrolled loop
+; and executes the 2^32 iterations there, in groups of 2.
+
+; CHECK: entry:
+; CHECK-NEXT: %0 = add i32 %N, 1
+; CHECK-NEXT: %xtraiter = and i32 %0, 1
+; CHECK-NEXT: %lcmp.mod = icmp ne i32 %xtraiter, 0
+; CHECK-NEXT: br i1 %lcmp.mod, label %while.body.prol, label %entry.split
+
 ; CHECK: while.body.prol:
-; CHECK: br i1
+; CHECK: br label %entry.split
 ; CHECK: entry.split:

 ; Function Attrs: nounwind readnone ssp uwtable
-define i32 @foo(i32 %N) #0 {
+define i32 @foo(i32 %N) {
 entry:
   br label %while.body
@@ -26,5 +35,3 @@ while.body:                                       ; preds = %while.body, %entry
 while.end:                                        ; preds = %while.body
   ret i32 %i
 }
-
-attributes #0 = { nounwind readnone ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }

View File

@@ -418,3 +418,85 @@ for.end:                                          ; preds = %for.cond
   ret void
 }

+; Reverse loop
+;void foo6(double *in, double *out, unsigned size, int *trigger) {
+;
+;  for (int i=SIZE-1; i>=0; i--) {
+;    if (trigger[i] > 0) {
+;      out[i] = in[i] + (double) 0.5;
+;    }
+;  }
+;}
+;AVX2-LABEL: @foo6
+;AVX2: icmp sgt <4 x i32> %reverse, zeroinitializer
+;AVX2: shufflevector <4 x i1>{{.*}}<4 x i32> <i32 3, i32 2, i32 1, i32 0>
+;AVX2: call <4 x double> @llvm.masked.load.v4f64
+;AVX2: fadd <4 x double>
+;AVX2: call void @llvm.masked.store.v4f64
+;AVX2: ret void
+
+;AVX512-LABEL: @foo6
+;AVX512: icmp sgt <8 x i32> %reverse, zeroinitializer
+;AVX512: shufflevector <8 x i1>{{.*}}<8 x i32> <i32 7, i32 6, i32 5, i32 4
+;AVX512: call <8 x double> @llvm.masked.load.v8f64
+;AVX512: fadd <8 x double>
+;AVX512: call void @llvm.masked.store.v8f64
+;AVX512: ret void
+
+define void @foo6(double* %in, double* %out, i32 %size, i32* %trigger) {
+entry:
+  %in.addr = alloca double*, align 8
+  %out.addr = alloca double*, align 8
+  %size.addr = alloca i32, align 4
+  %trigger.addr = alloca i32*, align 8
+  %i = alloca i32, align 4
+  store double* %in, double** %in.addr, align 8
+  store double* %out, double** %out.addr, align 8
+  store i32 %size, i32* %size.addr, align 4
+  store i32* %trigger, i32** %trigger.addr, align 8
+  store i32 4095, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32* %i, align 4
+  %cmp = icmp sge i32 %0, 0
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32* %i, align 4
+  %idxprom = sext i32 %1 to i64
+  %2 = load i32** %trigger.addr, align 8
+  %arrayidx = getelementptr inbounds i32* %2, i64 %idxprom
+  %3 = load i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %3, 0
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  %4 = load i32* %i, align 4
+  %idxprom2 = sext i32 %4 to i64
+  %5 = load double** %in.addr, align 8
+  %arrayidx3 = getelementptr inbounds double* %5, i64 %idxprom2
+  %6 = load double* %arrayidx3, align 8
+  %add = fadd double %6, 5.000000e-01
+  %7 = load i32* %i, align 4
+  %idxprom4 = sext i32 %7 to i64
+  %8 = load double** %out.addr, align 8
+  %arrayidx5 = getelementptr inbounds double* %8, i64 %idxprom4
+  store double %add, double* %arrayidx5, align 8
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %9 = load i32* %i, align 4
+  %dec = add nsw i32 %9, -1
+  store i32 %dec, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}

View File

@@ -10,6 +10,7 @@ set(LLVM_LINK_COMPONENTS
   MC
   MCJIT
   Object
+  RuntimeDyld
   SelectionDAG
   Support
   native

View File

@@ -3,6 +3,7 @@ set(LLVM_LINK_COMPONENTS
   ExecutionEngine
   Interpreter
   MC
+  RuntimeDyld
   Support
   )

View File

@@ -5,6 +5,7 @@ set(LLVM_LINK_COMPONENTS
   IPO
   MC
   MCJIT
+  RuntimeDyld
   ScalarOpts
   Support
   Target