Vendor import of llvm RELEASE_360/rc4 tag r229772 (effectively, 3.6.0 RC4):

https://llvm.org/svn/llvm-project/llvm/tags/RELEASE_360/rc4@229772
dim 2015-02-19 20:55:17 +00:00
parent 85d2764eab
commit 49b6407b6c
36 changed files with 643 additions and 123 deletions

View File

@@ -5,11 +5,6 @@ LLVM 3.6 Release Notes
.. contents::
:local:
.. warning::
These are in-progress notes for the upcoming LLVM 3.6 release. You may
prefer the `LLVM 3.5 Release Notes <http://llvm.org/releases/3.5.0/docs
/ReleaseNotes.html>`_.
Introduction
============
@@ -26,10 +21,6 @@ have questions or comments, the `LLVM Developer's Mailing List
<http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev>`_ is a good place to send
them.
Note that if you are reading this file from a Subversion checkout or the main
LLVM web page, this document applies to the *next* release, not the current
one. To see the release notes for a specific release, please see the `releases
page <http://llvm.org/releases/>`_.
Non-comprehensive list of changes in this release
=================================================
@@ -544,6 +535,33 @@ new LLVM-based code generators "on the fly" for the designed processors and
loads them into the compiler backend as runtime libraries to avoid
per-target recompilation of larger parts of the compiler chain.
Likely
------
`Likely <http://www.liblikely.org>`_ is an embeddable just-in-time Lisp for
image recognition and heterogeneous computing. Algorithms are just-in-time
compiled using LLVM's MCJIT infrastructure to execute on single or
multi-threaded CPUs and potentially OpenCL SPIR or CUDA enabled GPUs.
Likely seeks to explore new optimizations for statistical learning
algorithms by moving them from an offline model generation step to the
compile-time evaluation of a function (the learning algorithm) with constant
arguments (the training data).
LDC - the LLVM-based D compiler
-------------------------------
`D <http://dlang.org>`_ is a language with C-like syntax and static typing. It
pragmatically combines efficiency, control, and modeling power, with safety and
programmer productivity. D supports powerful concepts like Compile-Time Function
Execution (CTFE) and Template Meta-Programming, provides an innovative approach
to concurrency and offers many classical paradigms.
`LDC <http://wiki.dlang.org/LDC>`_ uses the frontend from the reference compiler
combined with LLVM as backend to produce efficient native code. LDC targets
x86/x86_64 systems like Linux, OS X, FreeBSD and Windows and also Linux on
PowerPC (32/64 bit). Ports to other architectures like ARM, AArch64 and MIPS64
are underway.
Additional Information
======================

View File

@@ -1,11 +1,6 @@
Overview
========
.. warning::
If you are using a released version of LLVM, see `the download page
<http://llvm.org/releases/>`_ to find your documentation.
The LLVM compiler infrastructure supports a wide range of projects, from
industrial strength compilers to specialized JIT applications to small
research projects.

View File

@@ -4,6 +4,7 @@ set(LLVM_LINK_COMPONENTS
ExecutionEngine
InstCombine
MC
RuntimeDyld
ScalarOpts
Support
native

View File

@@ -867,9 +867,11 @@ class SelectionDAG {
SDValue Offset, ISD::MemIndexedMode AM);
SDValue getMaskedLoad(EVT VT, SDLoc dl, SDValue Chain, SDValue Ptr,
SDValue Mask, SDValue Src0, MachineMemOperand *MMO);
SDValue Mask, SDValue Src0, EVT MemVT,
MachineMemOperand *MMO, ISD::LoadExtType);
SDValue getMaskedStore(SDValue Chain, SDLoc dl, SDValue Val,
SDValue Ptr, SDValue Mask, MachineMemOperand *MMO);
SDValue Ptr, SDValue Mask, EVT MemVT,
MachineMemOperand *MMO, bool IsTrunc);
/// getSrcValue - Construct a node to track a Value* through the backend.
SDValue getSrcValue(const Value *v);

View File

@@ -1970,13 +1970,17 @@ class MaskedLoadStoreSDNode : public MemSDNode {
class MaskedLoadSDNode : public MaskedLoadStoreSDNode {
public:
friend class SelectionDAG;
MaskedLoadSDNode(unsigned Order, DebugLoc dl,
SDValue *Operands, unsigned numOperands,
SDVTList VTs, EVT MemVT, MachineMemOperand *MMO)
MaskedLoadSDNode(unsigned Order, DebugLoc dl, SDValue *Operands,
unsigned numOperands, SDVTList VTs, ISD::LoadExtType ETy,
EVT MemVT, MachineMemOperand *MMO)
: MaskedLoadStoreSDNode(ISD::MLOAD, Order, dl, Operands, numOperands,
VTs, MemVT, MMO)
{}
VTs, MemVT, MMO) {
SubclassData |= (unsigned short)ETy;
}
ISD::LoadExtType getExtensionType() const {
return ISD::LoadExtType(SubclassData & 3);
}
const SDValue &getSrc0() const { return getOperand(3); }
static bool classof(const SDNode *N) {
return N->getOpcode() == ISD::MLOAD;
@@ -1989,14 +1993,19 @@ class MaskedStoreSDNode : public MaskedLoadStoreSDNode {
public:
friend class SelectionDAG;
MaskedStoreSDNode(unsigned Order, DebugLoc dl,
SDValue *Operands, unsigned numOperands,
SDVTList VTs, EVT MemVT, MachineMemOperand *MMO)
MaskedStoreSDNode(unsigned Order, DebugLoc dl, SDValue *Operands,
unsigned numOperands, SDVTList VTs, bool isTrunc, EVT MemVT,
MachineMemOperand *MMO)
: MaskedLoadStoreSDNode(ISD::MSTORE, Order, dl, Operands, numOperands,
VTs, MemVT, MMO)
{}
VTs, MemVT, MMO) {
SubclassData |= (unsigned short)isTrunc;
}
/// isTruncatingStore - Return true if the op does a truncation before store.
/// For integers this is the same as doing a TRUNCATE and storing the result.
/// For floats, it is the same as doing an FP_ROUND and storing the result.
bool isTruncatingStore() const { return SubclassData & 1; }
const SDValue &getData() const { return getOperand(3); }
const SDValue &getValue() const { return getOperand(3); }
static bool classof(const SDNode *N) {
return N->getOpcode() == ISD::MSTORE;

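A scalar model may help picture the isTruncatingStore semantics documented above. The following standalone sketch uses hypothetical i32 -> i16 lane types chosen only for illustration; it is not LLVM API, just the per-lane behavior the doc comment describes:

#include <cassert>
#include <cstdint>

// Truncating masked store, modeled per lane: each enabled lane is
// truncated to the memory element type and stored; disabled lanes
// leave memory untouched.
int main() {
  uint32_t Val[4]  = {0x00010002, 0xDEADBEEF, 0x7FFF0003, 0x12345678};
  bool     Mask[4] = {true, false, true, false};
  uint16_t Mem[4]  = {0, 0, 0, 0};
  for (int i = 0; i != 4; ++i)
    if (Mask[i])
      Mem[i] = (uint16_t)Val[i];   // TRUNCATE i32 -> i16, then store
  assert(Mem[0] == 0x0002 && Mem[1] == 0 && Mem[2] == 0x0003 && Mem[3] == 0);
  return 0;
}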
View File

@@ -6,9 +6,6 @@
/* Exported configuration */
#include "llvm/Config/llvm-config.h"
/* Patch version of the LLVM API */
#cmakedefine LLVM_VERSION_PATCH ${LLVM_VERSION_PATCH}
/* Bug report URL. */
#define BUG_REPORT_URL "${BUG_REPORT_URL}"

View File

@@ -87,10 +87,13 @@
#cmakedefine LLVM_USE_OPROFILE 1
/* Major version of the LLVM API */
#cmakedefine LLVM_VERSION_MAJOR ${LLVM_VERSION_MAJOR}
#define LLVM_VERSION_MAJOR ${LLVM_VERSION_MAJOR}
/* Minor version of the LLVM API */
#cmakedefine LLVM_VERSION_MINOR ${LLVM_VERSION_MINOR}
#define LLVM_VERSION_MINOR ${LLVM_VERSION_MINOR}
/* Patch version of the LLVM API */
#define LLVM_VERSION_PATCH ${LLVM_VERSION_PATCH}
/* LLVM version string */
#define LLVM_VERSION_STRING "${PACKAGE_VERSION}"

View File

@@ -92,6 +92,9 @@
/* Minor version of the LLVM API */
#undef LLVM_VERSION_MINOR
/* Patch version of the LLVM API */
#undef LLVM_VERSION_PATCH
/* LLVM version string */
#undef LLVM_VERSION_STRING

View File

@@ -325,6 +325,9 @@ class ConstantAggregateZero : public Constant {
/// index.
Constant *getElementValue(unsigned Idx) const;
/// \brief Return the number of elements in the array, vector, or struct.
unsigned getNumElements() const;
/// Methods for support type inquiry through isa, cast, and dyn_cast:
///
static bool classof(const Value *V) {
@@ -1196,6 +1199,9 @@ class UndefValue : public Constant {
/// index.
UndefValue *getElementValue(unsigned Idx) const;
/// \brief Return the number of elements in the array, vector, or struct.
unsigned getNumElements() const;
void destroyConstant() override;
/// Methods for support type inquiry through isa, cast, and dyn_cast:

View File

@@ -538,9 +538,17 @@ Metadata *BitcodeReaderMDValueList::getValueFwdRef(unsigned Idx) {
if (Metadata *MD = MDValuePtrs[Idx])
return MD;
// Create and return a placeholder, which will later be RAUW'd.
AnyFwdRefs = true;
// Track forward refs to be resolved later.
if (AnyFwdRefs) {
MinFwdRef = std::min(MinFwdRef, Idx);
MaxFwdRef = std::max(MaxFwdRef, Idx);
} else {
AnyFwdRefs = true;
MinFwdRef = MaxFwdRef = Idx;
}
++NumFwdRefs;
// Create and return a placeholder, which will later be RAUW'd.
Metadata *MD = MDNode::getTemporary(Context, None);
MDValuePtrs[Idx].reset(MD);
return MD;
@@ -556,11 +564,15 @@ void BitcodeReaderMDValueList::tryToResolveCycles() {
return;
// Resolve any cycles.
for (auto &MD : MDValuePtrs) {
for (unsigned I = MinFwdRef, E = MaxFwdRef + 1; I != E; ++I) {
auto &MD = MDValuePtrs[I];
assert(!(MD && isa<MDNodeFwdDecl>(MD)) && "Unexpected forward reference");
if (auto *N = dyn_cast_or_null<UniquableMDNode>(MD))
N->resolveCycles();
}
// Make sure we return early again until there's another forward ref.
AnyFwdRefs = false;
}
Type *BitcodeReader::getTypeByID(unsigned ID) {

View File

@@ -99,6 +99,8 @@ class BitcodeReaderValueList {
class BitcodeReaderMDValueList {
unsigned NumFwdRefs;
bool AnyFwdRefs;
unsigned MinFwdRef;
unsigned MaxFwdRef;
std::vector<TrackingMDRef> MDValuePtrs;
LLVMContext &Context;

View File

@@ -4842,7 +4842,7 @@ SDValue DAGCombiner::visitMSTORE(SDNode *N) {
MaskedStoreSDNode *MST = dyn_cast<MaskedStoreSDNode>(N);
SDValue Mask = MST->getMask();
SDValue Data = MST->getData();
SDValue Data = MST->getValue();
SDLoc DL(N);
// If the MSTORE data type requires splitting and the mask is provided by a
@@ -4885,7 +4885,8 @@ SDValue DAGCombiner::visitMSTORE(SDNode *N) {
MachineMemOperand::MOStore, LoMemVT.getStoreSize(),
Alignment, MST->getAAInfo(), MST->getRanges());
Lo = DAG.getMaskedStore(Chain, DL, DataLo, Ptr, MaskLo, MMO);
Lo = DAG.getMaskedStore(Chain, DL, DataLo, Ptr, MaskLo, LoMemVT, MMO,
MST->isTruncatingStore());
unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
@@ -4897,7 +4898,8 @@ SDValue DAGCombiner::visitMSTORE(SDNode *N) {
SecondHalfAlignment, MST->getAAInfo(),
MST->getRanges());
Hi = DAG.getMaskedStore(Chain, DL, DataHi, Ptr, MaskHi, MMO);
Hi = DAG.getMaskedStore(Chain, DL, DataHi, Ptr, MaskHi, HiMemVT, MMO,
MST->isTruncatingStore());
AddToWorklist(Lo.getNode());
AddToWorklist(Hi.getNode());
@@ -4958,7 +4960,8 @@ SDValue DAGCombiner::visitMLOAD(SDNode *N) {
MachineMemOperand::MOLoad, LoMemVT.getStoreSize(),
Alignment, MLD->getAAInfo(), MLD->getRanges());
Lo = DAG.getMaskedLoad(LoVT, DL, Chain, Ptr, MaskLo, Src0Lo, MMO);
Lo = DAG.getMaskedLoad(LoVT, DL, Chain, Ptr, MaskLo, Src0Lo, LoMemVT, MMO,
ISD::NON_EXTLOAD);
unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
@@ -4969,7 +4972,8 @@ SDValue DAGCombiner::visitMLOAD(SDNode *N) {
MachineMemOperand::MOLoad, HiMemVT.getStoreSize(),
SecondHalfAlignment, MLD->getAAInfo(), MLD->getRanges());
Hi = DAG.getMaskedLoad(HiVT, DL, Chain, Ptr, MaskHi, Src0Hi, MMO);
Hi = DAG.getMaskedLoad(HiVT, DL, Chain, Ptr, MaskHi, Src0Hi, HiMemVT, MMO,
ISD::NON_EXTLOAD);
AddToWorklist(Lo.getNode());
AddToWorklist(Hi.getNode());
@@ -9482,6 +9486,8 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
unsigned MSB = BitWidth - Imm.countLeadingZeros() - 1;
unsigned NewBW = NextPowerOf2(MSB - ShAmt);
EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
// The narrowing should be profitable, the load/store operation should be
// legal (or custom) and the store size should be equal to the NewVT width.
while (NewBW < BitWidth &&
!(TLI.isOperationLegalOrCustom(Opc, NewVT) &&
TLI.isNarrowingProfitable(VT, NewVT))) {

View File

@@ -458,16 +458,16 @@ SDValue DAGTypeLegalizer::PromoteIntRes_LOAD(LoadSDNode *N) {
SDValue DAGTypeLegalizer::PromoteIntRes_MLOAD(MaskedLoadSDNode *N) {
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue ExtSrc0 = GetPromotedInteger(N->getSrc0());
SDValue ExtMask = PromoteTargetBoolean(N->getMask(), NVT);
SDValue Mask = N->getMask();
EVT NewMaskVT = getSetCCResultType(NVT);
if (NewMaskVT != N->getMask().getValueType())
Mask = PromoteTargetBoolean(Mask, NewMaskVT);
SDLoc dl(N);
MachineMemOperand *MMO = DAG.getMachineFunction().
getMachineMemOperand(N->getPointerInfo(),
MachineMemOperand::MOLoad, NVT.getStoreSize(),
N->getAlignment(), N->getAAInfo(), N->getRanges());
SDValue Res = DAG.getMaskedLoad(NVT, dl, N->getChain(), N->getBasePtr(),
ExtMask, ExtSrc0, MMO);
Mask, ExtSrc0, N->getMemoryVT(),
N->getMemOperand(), ISD::SEXTLOAD);
// Legalized the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
@@ -1117,16 +1117,18 @@ SDValue DAGTypeLegalizer::PromoteIntOp_STORE(StoreSDNode *N, unsigned OpNo){
SDValue DAGTypeLegalizer::PromoteIntOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo){
assert(OpNo == 2 && "Only know how to promote the mask!");
SDValue DataOp = N->getData();
SDValue DataOp = N->getValue();
EVT DataVT = DataOp.getValueType();
SDValue Mask = N->getMask();
EVT MaskVT = Mask.getValueType();
SDLoc dl(N);
bool TruncateStore = false;
if (!TLI.isTypeLegal(DataVT)) {
if (getTypeAction(DataVT) == TargetLowering::TypePromoteInteger) {
DataOp = GetPromotedInteger(DataOp);
Mask = PromoteTargetBoolean(Mask, DataOp.getValueType());
TruncateStore = true;
}
else {
assert(getTypeAction(DataVT) == TargetLowering::TypeWidenVector &&
@@ -1156,10 +1158,9 @@ SDValue DAGTypeLegalizer::PromoteIntOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo){
}
else
Mask = PromoteTargetBoolean(N->getMask(), DataOp.getValueType());
SmallVector<SDValue, 4> NewOps(N->op_begin(), N->op_end());
NewOps[2] = Mask;
NewOps[3] = DataOp;
return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
return DAG.getMaskedStore(N->getChain(), dl, DataOp, N->getBasePtr(), Mask,
N->getMemoryVT(), N->getMemOperand(),
TruncateStore);
}
SDValue DAGTypeLegalizer::PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo){

View File

@@ -659,6 +659,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
SDValue WidenVecOp_EXTRACT_VECTOR_ELT(SDNode *N);
SDValue WidenVecOp_EXTRACT_SUBVECTOR(SDNode *N);
SDValue WidenVecOp_STORE(SDNode* N);
SDValue WidenVecOp_MSTORE(SDNode* N, unsigned OpNo);
SDValue WidenVecOp_SETCC(SDNode* N);
SDValue WidenVecOp_Convert(SDNode *N);

View File

@@ -992,6 +992,7 @@ void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD,
SDValue Ptr = MLD->getBasePtr();
SDValue Mask = MLD->getMask();
unsigned Alignment = MLD->getOriginalAlignment();
ISD::LoadExtType ExtType = MLD->getExtensionType();
// if Alignment is equal to the vector size,
// take the half of it for the second part
@@ -1015,7 +1016,8 @@ void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD,
MachineMemOperand::MOLoad, LoMemVT.getStoreSize(),
Alignment, MLD->getAAInfo(), MLD->getRanges());
Lo = DAG.getMaskedLoad(LoVT, dl, Ch, Ptr, MaskLo, Src0Lo, MMO);
Lo = DAG.getMaskedLoad(LoVT, dl, Ch, Ptr, MaskLo, Src0Lo, LoMemVT, MMO,
ExtType);
unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
@@ -1026,7 +1028,8 @@ void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD,
MachineMemOperand::MOLoad, HiMemVT.getStoreSize(),
SecondHalfAlignment, MLD->getAAInfo(), MLD->getRanges());
Hi = DAG.getMaskedLoad(HiVT, dl, Ch, Ptr, MaskHi, Src0Hi, MMO);
Hi = DAG.getMaskedLoad(HiVT, dl, Ch, Ptr, MaskHi, Src0Hi, HiMemVT, MMO,
ExtType);
// Build a factor node to remember that this load is independent of the
@@ -1464,7 +1467,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N,
SDValue Ch = N->getChain();
SDValue Ptr = N->getBasePtr();
SDValue Mask = N->getMask();
SDValue Data = N->getData();
SDValue Data = N->getValue();
EVT MemoryVT = N->getMemoryVT();
unsigned Alignment = N->getOriginalAlignment();
SDLoc DL(N);
@@ -1489,7 +1492,8 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N,
MachineMemOperand::MOStore, LoMemVT.getStoreSize(),
Alignment, N->getAAInfo(), N->getRanges());
Lo = DAG.getMaskedStore(Ch, DL, DataLo, Ptr, MaskLo, MMO);
Lo = DAG.getMaskedStore(Ch, DL, DataLo, Ptr, MaskLo, LoMemVT, MMO,
N->isTruncatingStore());
unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
@@ -1500,7 +1504,8 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N,
MachineMemOperand::MOStore, HiMemVT.getStoreSize(),
SecondHalfAlignment, N->getAAInfo(), N->getRanges());
Hi = DAG.getMaskedStore(Ch, DL, DataHi, Ptr, MaskHi, MMO);
Hi = DAG.getMaskedStore(Ch, DL, DataHi, Ptr, MaskHi, HiMemVT, MMO,
N->isTruncatingStore());
// Build a factor node to remember that this store is independent of the
@@ -2412,6 +2417,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_MLOAD(MaskedLoadSDNode *N) {
SDValue Mask = N->getMask();
EVT MaskVT = Mask.getValueType();
SDValue Src0 = GetWidenedVector(N->getSrc0());
ISD::LoadExtType ExtType = N->getExtensionType();
SDLoc dl(N);
if (getTypeAction(MaskVT) == TargetLowering::TypeWidenVector)
@@ -2434,14 +2440,9 @@ SDValue DAGTypeLegalizer::WidenVecRes_MLOAD(MaskedLoadSDNode *N) {
Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, BoolVT, Ops);
}
// Rebuild memory operand because MemoryVT was changed
MachineMemOperand *MMO = DAG.getMachineFunction().
getMachineMemOperand(N->getPointerInfo(),
MachineMemOperand::MOLoad, WidenVT.getStoreSize(),
N->getAlignment(), N->getAAInfo(), N->getRanges());
SDValue Res = DAG.getMaskedLoad(WidenVT, dl, N->getChain(), N->getBasePtr(),
Mask, Src0, MMO);
Mask, Src0, N->getMemoryVT(),
N->getMemOperand(), ExtType);
// Legalized the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
@@ -2593,6 +2594,7 @@ bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) {
case ISD::EXTRACT_SUBVECTOR: Res = WidenVecOp_EXTRACT_SUBVECTOR(N); break;
case ISD::EXTRACT_VECTOR_ELT: Res = WidenVecOp_EXTRACT_VECTOR_ELT(N); break;
case ISD::STORE: Res = WidenVecOp_STORE(N); break;
case ISD::MSTORE: Res = WidenVecOp_MSTORE(N, OpNo); break;
case ISD::SETCC: Res = WidenVecOp_SETCC(N); break;
case ISD::ANY_EXTEND:
@@ -2791,6 +2793,42 @@ SDValue DAGTypeLegalizer::WidenVecOp_STORE(SDNode *N) {
return DAG.getNode(ISD::TokenFactor, SDLoc(ST), MVT::Other, StChain);
}
SDValue DAGTypeLegalizer::WidenVecOp_MSTORE(SDNode *N, unsigned OpNo) {
MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
SDValue Mask = MST->getMask();
EVT MaskVT = Mask.getValueType();
SDValue StVal = MST->getValue();
// Widen the value
SDValue WideVal = GetWidenedVector(StVal);
SDLoc dl(N);
if (OpNo == 2 || getTypeAction(MaskVT) == TargetLowering::TypeWidenVector)
Mask = GetWidenedVector(Mask);
else {
// The mask should be widened as well
EVT BoolVT = getSetCCResultType(WideVal.getValueType());
// We can't use ModifyToType() because we should fill the mask with
// zeroes
unsigned WidenNumElts = BoolVT.getVectorNumElements();
unsigned MaskNumElts = MaskVT.getVectorNumElements();
unsigned NumConcat = WidenNumElts / MaskNumElts;
SmallVector<SDValue, 16> Ops(NumConcat);
SDValue ZeroVal = DAG.getConstant(0, MaskVT);
Ops[0] = Mask;
for (unsigned i = 1; i != NumConcat; ++i)
Ops[i] = ZeroVal;
Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, BoolVT, Ops);
}
assert(Mask.getValueType().getVectorNumElements() ==
WideVal.getValueType().getVectorNumElements() &&
"Mask and data vectors should have the same number of elements");
return DAG.getMaskedStore(MST->getChain(), dl, WideVal, MST->getBasePtr(),
Mask, MST->getMemoryVT(), MST->getMemOperand(),
false);
}
SDValue DAGTypeLegalizer::WidenVecOp_SETCC(SDNode *N) {
SDValue InOp0 = GetWidenedVector(N->getOperand(0));
SDValue InOp1 = GetWidenedVector(N->getOperand(1));

View File

@@ -4924,15 +4924,15 @@ SelectionDAG::getIndexedStore(SDValue OrigStore, SDLoc dl, SDValue Base,
SDValue
SelectionDAG::getMaskedLoad(EVT VT, SDLoc dl, SDValue Chain,
SDValue Ptr, SDValue Mask, SDValue Src0,
MachineMemOperand *MMO) {
SDValue Ptr, SDValue Mask, SDValue Src0, EVT MemVT,
MachineMemOperand *MMO, ISD::LoadExtType ExtTy) {
SDVTList VTs = getVTList(VT, MVT::Other);
SDValue Ops[] = { Chain, Ptr, Mask, Src0 };
FoldingSetNodeID ID;
AddNodeIDNode(ID, ISD::MLOAD, VTs, Ops);
ID.AddInteger(VT.getRawBits());
ID.AddInteger(encodeMemSDNodeFlags(ISD::NON_EXTLOAD, ISD::UNINDEXED,
ID.AddInteger(encodeMemSDNodeFlags(ExtTy, ISD::UNINDEXED,
MMO->isVolatile(),
MMO->isNonTemporal(),
MMO->isInvariant()));
@@ -4944,14 +4944,15 @@ SelectionDAG::getMaskedLoad(EVT VT, SDLoc dl, SDValue Chain,
}
SDNode *N = new (NodeAllocator) MaskedLoadSDNode(dl.getIROrder(),
dl.getDebugLoc(), Ops, 4, VTs,
VT, MMO);
ExtTy, MemVT, MMO);
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
}
SDValue SelectionDAG::getMaskedStore(SDValue Chain, SDLoc dl, SDValue Val,
SDValue Ptr, SDValue Mask, MachineMemOperand *MMO) {
SDValue Ptr, SDValue Mask, EVT MemVT,
MachineMemOperand *MMO, bool isTrunc) {
assert(Chain.getValueType() == MVT::Other &&
"Invalid chain type");
EVT VT = Val.getValueType();
@@ -4970,7 +4971,7 @@ SDValue SelectionDAG::getMaskedStore(SDValue Chain, SDLoc dl, SDValue Val,
}
SDNode *N = new (NodeAllocator) MaskedStoreSDNode(dl.getIROrder(),
dl.getDebugLoc(), Ops, 4,
VTs, VT, MMO);
VTs, isTrunc, MemVT, MMO);
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);

View File

@@ -3667,7 +3667,8 @@ void SelectionDAGBuilder::visitMaskedStore(const CallInst &I) {
getMachineMemOperand(MachinePointerInfo(PtrOperand),
MachineMemOperand::MOStore, VT.getStoreSize(),
Alignment, AAInfo);
SDValue StoreNode = DAG.getMaskedStore(getRoot(), sdl, Src0, Ptr, Mask, MMO);
SDValue StoreNode = DAG.getMaskedStore(getRoot(), sdl, Src0, Ptr, Mask, VT,
MMO, false);
DAG.setRoot(StoreNode);
setValue(&I, StoreNode);
}
@@ -3706,7 +3707,8 @@ void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I) {
MachineMemOperand::MOLoad, VT.getStoreSize(),
Alignment, AAInfo, Ranges);
SDValue Load = DAG.getMaskedLoad(VT, sdl, InChain, Ptr, Mask, Src0, MMO);
SDValue Load = DAG.getMaskedLoad(VT, sdl, InChain, Ptr, Mask, Src0, VT, MMO,
ISD::NON_EXTLOAD);
SDValue OutChain = Load.getValue(1);
DAG.setRoot(OutChain);
setValue(&I, Load);

View File

@@ -4,7 +4,6 @@ add_llvm_library(LLVMExecutionEngine
ExecutionEngine.cpp
ExecutionEngineBindings.cpp
GDBRegistrationListener.cpp
RTDyldMemoryManager.cpp
TargetSelect.cpp
)

View File

@@ -22,4 +22,4 @@ subdirectories = Interpreter MCJIT RuntimeDyld IntelJITEvents OProfileJIT
type = Library
name = ExecutionEngine
parent = Libraries
required_libraries = Core MC Object Support
required_libraries = Core MC Object Support RuntimeDyld

View File

@@ -1,4 +1,5 @@
add_llvm_library(LLVMRuntimeDyld
RTDyldMemoryManager.cpp
RuntimeDyld.cpp
RuntimeDyldChecker.cpp
RuntimeDyldELF.cpp

View File

@@ -257,11 +257,11 @@ Constant *Constant::getAggregateElement(unsigned Elt) const {
if (const ConstantVector *CV = dyn_cast<ConstantVector>(this))
return Elt < CV->getNumOperands() ? CV->getOperand(Elt) : nullptr;
if (const ConstantAggregateZero *CAZ =dyn_cast<ConstantAggregateZero>(this))
return CAZ->getElementValue(Elt);
if (const ConstantAggregateZero *CAZ = dyn_cast<ConstantAggregateZero>(this))
return Elt < CAZ->getNumElements() ? CAZ->getElementValue(Elt) : nullptr;
if (const UndefValue *UV = dyn_cast<UndefValue>(this))
return UV->getElementValue(Elt);
return Elt < UV->getNumElements() ? UV->getElementValue(Elt) : nullptr;
if (const ConstantDataSequential *CDS =dyn_cast<ConstantDataSequential>(this))
return Elt < CDS->getNumElements() ? CDS->getElementAsConstant(Elt)
@@ -764,6 +764,14 @@ Constant *ConstantAggregateZero::getElementValue(unsigned Idx) const {
return getStructElement(Idx);
}
unsigned ConstantAggregateZero::getNumElements() const {
const Type *Ty = getType();
if (const auto *AT = dyn_cast<ArrayType>(Ty))
return AT->getNumElements();
if (const auto *VT = dyn_cast<VectorType>(Ty))
return VT->getNumElements();
return Ty->getStructNumElements();
}
//===----------------------------------------------------------------------===//
// UndefValue Implementation
@@ -797,7 +805,14 @@ UndefValue *UndefValue::getElementValue(unsigned Idx) const {
return getStructElement(Idx);
}
unsigned UndefValue::getNumElements() const {
const Type *Ty = getType();
if (const auto *AT = dyn_cast<ArrayType>(Ty))
return AT->getNumElements();
if (const auto *VT = dyn_cast<VectorType>(Ty))
return VT->getNumElements();
return Ty->getStructNumElements();
}
//===----------------------------------------------------------------------===//
// ConstantXXX Classes

View File

@@ -1679,7 +1679,9 @@ void X86TargetLowering::resetOperationActions() {
setTargetDAGCombine(ISD::FMA);
setTargetDAGCombine(ISD::SUB);
setTargetDAGCombine(ISD::LOAD);
setTargetDAGCombine(ISD::MLOAD);
setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::MSTORE);
setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::ANY_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND);
@@ -24738,6 +24740,166 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
/// PerformMLOADCombine - Resolve extending loads
static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
if (Mld->getExtensionType() != ISD::SEXTLOAD)
return SDValue();
EVT VT = Mld->getValueType(0);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned NumElems = VT.getVectorNumElements();
EVT LdVT = Mld->getMemoryVT();
SDLoc dl(Mld);
assert(LdVT != VT && "Cannot extend to the same type");
unsigned ToSz = VT.getVectorElementType().getSizeInBits();
unsigned FromSz = LdVT.getVectorElementType().getSizeInBits();
// From, To sizes and ElemCount must be pow of two
assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
"Unexpected size for extending masked load");
unsigned SizeRatio = ToSz / FromSz;
assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
// Create a type on which we perform the shuffle
EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
LdVT.getScalarType(), NumElems*SizeRatio);
assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
// Convert Src0 value
SDValue WideSrc0 = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mld->getSrc0());
if (Mld->getSrc0().getOpcode() != ISD::UNDEF) {
SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
for (unsigned i = 0; i != NumElems; ++i)
ShuffleVec[i] = i * SizeRatio;
// Can't shuffle using an illegal type.
assert (TLI.isTypeLegal(WideVecVT) && "WideVecVT should be legal");
WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
}
// Prepare the new mask
SDValue NewMask;
SDValue Mask = Mld->getMask();
if (Mask.getValueType() == VT) {
// Mask and original value have the same type
NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask);
SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
for (unsigned i = 0; i != NumElems; ++i)
ShuffleVec[i] = i * SizeRatio;
for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
ShuffleVec[i] = NumElems*SizeRatio;
NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
DAG.getConstant(0, WideVecVT),
&ShuffleVec[0]);
}
else {
assert(Mask.getValueType().getVectorElementType() == MVT::i1);
unsigned WidenNumElts = NumElems*SizeRatio;
unsigned MaskNumElts = VT.getVectorNumElements();
EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
WidenNumElts);
unsigned NumConcat = WidenNumElts / MaskNumElts;
SmallVector<SDValue, 16> Ops(NumConcat);
SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType());
Ops[0] = Mask;
for (unsigned i = 1; i != NumConcat; ++i)
Ops[i] = ZeroVal;
NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
}
SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
Mld->getBasePtr(), NewMask, WideSrc0,
Mld->getMemoryVT(), Mld->getMemOperand(),
ISD::NON_EXTLOAD);
SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
}
/// PerformMSTORECombine - Resolve truncating stores
static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
if (!Mst->isTruncatingStore())
return SDValue();
EVT VT = Mst->getValue().getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned NumElems = VT.getVectorNumElements();
EVT StVT = Mst->getMemoryVT();
SDLoc dl(Mst);
assert(StVT != VT && "Cannot truncate to the same type");
unsigned FromSz = VT.getVectorElementType().getSizeInBits();
unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
// From, To sizes and ElemCount must be pow of two
assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
"Unexpected size for truncating masked store");
// We are going to use the original vector elt for storing.
// Accumulated smaller vector elements must be a multiple of the store size.
assert (((NumElems * FromSz) % ToSz) == 0 &&
"Unexpected ratio for truncating masked store");
unsigned SizeRatio = FromSz / ToSz;
assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
// Create a type on which we perform the shuffle
EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
StVT.getScalarType(), NumElems*SizeRatio);
assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mst->getValue());
SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
for (unsigned i = 0; i != NumElems; ++i)
ShuffleVec[i] = i * SizeRatio;
// Can't shuffle using an illegal type.
assert (TLI.isTypeLegal(WideVecVT) && "WideVecVT should be legal");
SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
DAG.getUNDEF(WideVecVT),
&ShuffleVec[0]);
SDValue NewMask;
SDValue Mask = Mst->getMask();
if (Mask.getValueType() == VT) {
// Mask and original value have the same type
NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask);
for (unsigned i = 0; i != NumElems; ++i)
ShuffleVec[i] = i * SizeRatio;
for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
ShuffleVec[i] = NumElems*SizeRatio;
NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
DAG.getConstant(0, WideVecVT),
&ShuffleVec[0]);
}
else {
assert(Mask.getValueType().getVectorElementType() == MVT::i1);
unsigned WidenNumElts = NumElems*SizeRatio;
unsigned MaskNumElts = VT.getVectorNumElements();
EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
WidenNumElts);
unsigned NumConcat = WidenNumElts / MaskNumElts;
SmallVector<SDValue, 16> Ops(NumConcat);
SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType());
Ops[0] = Mask;
for (unsigned i = 1; i != NumConcat; ++i)
Ops[i] = ZeroVal;
NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
}
return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, Mst->getBasePtr(),
NewMask, StVT, Mst->getMemOperand(), false);
}
/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
@@ -25836,7 +25998,9 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget);
case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget);
case ISD::LOAD: return PerformLOADCombine(N, DAG, DCI, Subtarget);
case ISD::MLOAD: return PerformMLOADCombine(N, DAG, DCI, Subtarget);
case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
case ISD::MSTORE: return PerformMSTORECombine(N, DAG, Subtarget);
case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this);
case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget);
case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget);

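To make the index arithmetic shared by PerformMLOADCombine and PerformMSTORECombine above concrete, here is a standalone worked example. The types (VT = v8i32 sign-extended from LdVT = v8i16) are illustrative choices; the real code derives all sizes from the node's EVTs:

#include <cstdio>
#include <vector>

// Worked numbers for the extending-load combine: a sign-extending masked
// load of v8i16 into v8i32 becomes a non-extending v16i16 masked load
// followed by X86ISD::VSEXT.
int main() {
  unsigned NumElems = 8;                      // elements of VT
  unsigned ToSz = 32, FromSz = 16;            // element bits of VT and LdVT
  unsigned SizeRatio = ToSz / FromSz;         // 2
  unsigned WideElems = NumElems * SizeRatio;  // WideVecVT = v16i16
  // Mask shuffle: result lane i takes input lane i*SizeRatio for
  // i < NumElems; the remaining lanes take index WideElems, i.e. lane 0
  // of the all-zero second operand, so no widened lane is ever enabled.
  std::vector<int> ShuffleVec(WideElems);
  for (unsigned i = 0; i != WideElems; ++i)
    ShuffleVec[i] = i < NumElems ? int(i * SizeRatio) : int(WideElems);
  for (int m : ShuffleVec)
    printf("%d ", m);  // 0 2 4 6 8 10 12 14 16 16 16 16 16 16 16 16
  printf("\n");
  return 0;
}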
View File

@@ -403,7 +403,7 @@ bool MergedLoadStoreMotion::isStoreSinkBarrierInRange(const Instruction& Start,
const Instruction& End,
AliasAnalysis::Location
Loc) {
return AA->canInstructionRangeModRef(Start, End, Loc, AliasAnalysis::Ref);
return AA->canInstructionRangeModRef(Start, End, Loc, AliasAnalysis::ModRef);
}
///
@@ -414,6 +414,7 @@ bool MergedLoadStoreMotion::isStoreSinkBarrierInRange(const Instruction& Start,
StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB1,
StoreInst *Store0) {
DEBUG(dbgs() << "can Sink? : "; Store0->dump(); dbgs() << "\n");
BasicBlock *BB0 = Store0->getParent();
for (BasicBlock::reverse_iterator RBI = BB1->rbegin(), RBE = BB1->rend();
RBI != RBE; ++RBI) {
Instruction *Inst = &*RBI;
@@ -422,13 +423,14 @@ StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB1,
continue;
StoreInst *Store1 = cast<StoreInst>(Inst);
BasicBlock *BB0 = Store0->getParent();
AliasAnalysis::Location Loc0 = AA->getLocation(Store0);
AliasAnalysis::Location Loc1 = AA->getLocation(Store1);
if (AA->isMustAlias(Loc0, Loc1) && Store0->isSameOperationAs(Store1) &&
!isStoreSinkBarrierInRange(*Store1, BB1->back(), Loc1) &&
!isStoreSinkBarrierInRange(*Store0, BB0->back(), Loc0)) {
!isStoreSinkBarrierInRange(*(std::next(BasicBlock::iterator(Store1))),
BB1->back(), Loc1) &&
!isStoreSinkBarrierInRange(*(std::next(BasicBlock::iterator(Store0))),
BB0->back(), Loc0)) {
return Store1;
}
}

View File

@@ -55,7 +55,7 @@ STATISTIC(NumRuntimeUnrolled,
/// - Branch around the original loop if the trip count is less
/// than the unroll factor.
///
static void ConnectProlog(Loop *L, Value *TripCount, unsigned Count,
static void ConnectProlog(Loop *L, Value *BECount, unsigned Count,
BasicBlock *LastPrologBB, BasicBlock *PrologEnd,
BasicBlock *OrigPH, BasicBlock *NewPH,
ValueToValueMapTy &VMap, Pass *P) {
@@ -105,12 +105,19 @@ static void ConnectProlog(Loop *L, Value *TripCount, unsigned Count,
}
}
// Create a branch around the original loop, which is taken if the
// trip count is less than the unroll factor.
// Create a branch around the original loop, which is taken if there are no
// iterations remaining to be executed after running the prologue.
Instruction *InsertPt = PrologEnd->getTerminator();
assert(Count != 0 && "nonsensical Count!");
// If BECount <u (Count - 1) then (BECount + 1) & (Count - 1) == (BECount + 1)
// (since Count is a power of 2). This means %xtraiter is (BECount + 1) and
// all of the iterations of this loop were executed by the prologue. Note
// that if BECount <u (Count - 1) then (BECount + 1) cannot unsigned-overflow.
Instruction *BrLoopExit =
new ICmpInst(InsertPt, ICmpInst::ICMP_ULT, TripCount,
ConstantInt::get(TripCount->getType(), Count));
new ICmpInst(InsertPt, ICmpInst::ICMP_ULT, BECount,
ConstantInt::get(BECount->getType(), Count - 1));
BasicBlock *Exit = L->getUniqueExitBlock();
assert(Exit && "Loop must have a single exit block only");
// Split the exit to maintain loop canonicalization guarantees
@@ -292,23 +299,28 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI,
// Only unroll loops with a computable trip count and the trip count needs
// to be an int value (allowing a pointer type is a TODO item)
const SCEV *BECount = SE->getBackedgeTakenCount(L);
if (isa<SCEVCouldNotCompute>(BECount) || !BECount->getType()->isIntegerTy())
const SCEV *BECountSC = SE->getBackedgeTakenCount(L);
if (isa<SCEVCouldNotCompute>(BECountSC) ||
!BECountSC->getType()->isIntegerTy())
return false;
// If BECount is INT_MAX, we can't compute trip-count without overflow.
if (BECount->isAllOnesValue())
return false;
unsigned BEWidth = cast<IntegerType>(BECountSC->getType())->getBitWidth();
// Add 1 since the backedge count doesn't include the first loop iteration
const SCEV *TripCountSC =
SE->getAddExpr(BECount, SE->getConstant(BECount->getType(), 1));
SE->getAddExpr(BECountSC, SE->getConstant(BECountSC->getType(), 1));
if (isa<SCEVCouldNotCompute>(TripCountSC))
return false;
// We only handle cases when the unroll factor is a power of 2.
// Count is the loop unroll factor, the number of extra copies added + 1.
if ((Count & (Count-1)) != 0)
if (!isPowerOf2_32(Count))
return false;
// This constraint lets us deal with an overflowing trip count easily; see the
// comment on ModVal below. This check is equivalent to `Log2(Count) <
// BEWidth`.
if (static_cast<uint64_t>(Count) > (1ULL << BEWidth))
return false;
// If this loop is nested, then the loop unroller changes the code in
@@ -330,16 +342,23 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI,
SCEVExpander Expander(*SE, "loop-unroll");
Value *TripCount = Expander.expandCodeFor(TripCountSC, TripCountSC->getType(),
PreHeaderBR);
Value *BECount = Expander.expandCodeFor(BECountSC, BECountSC->getType(),
PreHeaderBR);
IRBuilder<> B(PreHeaderBR);
Value *ModVal = B.CreateAnd(TripCount, Count - 1, "xtraiter");
// Check if for no extra iterations, then jump to cloned/unrolled loop.
// We have to check that the trip count computation didn't overflow when
// adding one to the backedge taken count.
Value *LCmp = B.CreateIsNotNull(ModVal, "lcmp.mod");
Value *OverflowCheck = B.CreateIsNull(TripCount, "lcmp.overflow");
Value *BranchVal = B.CreateOr(OverflowCheck, LCmp, "lcmp.or");
// If ModVal is zero, we know that either
// 1. there are no iterations to be run in the prologue loop
// OR
// 2. the addition computing TripCount overflowed
//
// If (2) is true, we know that TripCount really is (1 << BEWidth) and so the
// number of iterations that remain to be run in the original loop is a
// multiple of Count == (1 << Log2(Count)) because Log2(Count) <= BEWidth (we
// explicitly check this above).
Value *BranchVal = B.CreateIsNotNull(ModVal, "lcmp.mod");
// Branch to either the extra iterations or the cloned/unrolled loop
// We will fix up the true branch label when adding loop body copies
@@ -362,10 +381,7 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI,
std::vector<BasicBlock *> NewBlocks;
ValueToValueMapTy VMap;
// If unroll count is 2 and we can't overflow in tripcount computation (which
// is BECount + 1), then we don't need a loop for prologue, and we can unroll
// it. We can be sure that we don't overflow only if tripcount is a constant.
bool UnrollPrologue = (Count == 2 && isa<ConstantInt>(TripCount));
bool UnrollPrologue = Count == 2;
// Clone all the basic blocks in the loop. If Count is 2, we don't clone
// the loop, otherwise we create a cloned loop to execute the extra
@@ -391,7 +407,7 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI,
// Connect the prolog code to the original loop and update the
// PHI functions.
BasicBlock *LastLoopBB = cast<BasicBlock>(VMap[Latch]);
ConnectProlog(L, TripCount, Count, LastLoopBB, PEnd, PH, NewPH, VMap,
ConnectProlog(L, BECount, Count, LastLoopBB, PEnd, PH, NewPH, VMap,
LPM->getAsPass());
NumRuntimeUnrolled++;
return true;

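The overflow reasoning in the comment blocks above reduces to two facts about unsigned arithmetic. This standalone sketch checks both; the 32-bit backedge-count width and Count = 4 are illustrative choices, not values from the pass:

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t Count = 4;  // unroll factor: a power of 2, Log2(Count) <= 32
  // Fact 1: if BECount <u (Count - 1), then
  // (BECount + 1) & (Count - 1) == BECount + 1, so %xtraiter equals the
  // whole trip count and the prologue executes every iteration.
  for (uint32_t BECount = 0; BECount < Count - 1; ++BECount)
    assert(((BECount + 1) & (Count - 1)) == BECount + 1);
  // Fact 2: if BECount is all-ones, TripCount = BECount + 1 wraps to 0 and
  // %xtraiter is 0; the true trip count 2^32 is then a multiple of Count,
  // since Log2(Count) <= BEWidth (the constraint the pass checks above).
  uint32_t BECount = UINT32_MAX;
  uint32_t TripCount = BECount + 1;  // well-defined unsigned wrap to 0
  assert(TripCount == 0 && (TripCount & (Count - 1)) == 0);
  assert((1ULL << 32) % Count == 0);
  return 0;
}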
View File

@@ -1874,6 +1874,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
// wide store needs to start at the last vector element.
PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
Mask[Part] = reverseVector(Mask[Part]);
}
Value *VecPtr = Builder.CreateBitCast(PartPtr,
@@ -1902,6 +1903,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
// wide load needs to start at the last vector element.
PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
Mask[Part] = reverseVector(Mask[Part]);
}
Instruction* NewLI;

View File

@@ -159,7 +159,7 @@ define void @test14(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) {
}
; AVX2-LABEL: test15
; AVX2: vpmaskmovq
; AVX2: vpmaskmovd
define void @test15(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) {
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
call void @llvm.masked.store.v2i32(<2 x i32>%val, <2 x i32>* %addr, i32 4, <2 x i1>%mask)
@@ -176,8 +176,9 @@ define <2 x float> @test16(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %
}
; AVX2-LABEL: test17
; AVX2: vpmaskmovq
; AVX2: vblendvpd
; AVX2: vpmaskmovd
; AVX2: vblendvps
; AVX2: vpmovsxdq
define <2 x i32> @test17(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) {
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
%res = call <2 x i32> @llvm.masked.load.v2i32(<2 x i32>* %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst)

View File

@@ -0,0 +1,114 @@
; ModuleID = 'bug.c'
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; RUN: opt -O2 -S < %s | FileCheck %s
; CHECK-LABEL: main
; CHECK: if.end
; CHECK: store
; CHECK: memset
; CHECK: if.then
; CHECK: store
; CHECK: memset
@d = common global i32 0, align 4
@b = common global i32 0, align 4
@f = common global [1 x [3 x i8]] zeroinitializer, align 1
@e = common global i32 0, align 4
@c = common global i32 0, align 4
@a = common global i32 0, align 4
; Function Attrs: nounwind uwtable
define void @fn1() #0 {
entry:
store i32 0, i32* @d, align 4
br label %for.cond
for.cond: ; preds = %for.inc8, %entry
%0 = load i32* @d, align 4
%cmp = icmp slt i32 %0, 2
br i1 %cmp, label %for.body, label %for.end10
for.body: ; preds = %for.cond
%1 = load i32* @d, align 4
%idxprom = sext i32 %1 to i64
%2 = load i32* @b, align 4
%idxprom1 = sext i32 %2 to i64
%arrayidx = getelementptr inbounds [1 x [3 x i8]]* @f, i32 0, i64 %idxprom1
%arrayidx2 = getelementptr inbounds [3 x i8]* %arrayidx, i32 0, i64 %idxprom
store i8 0, i8* %arrayidx2, align 1
store i32 0, i32* @e, align 4
br label %for.cond3
for.cond3: ; preds = %for.inc, %for.body
%3 = load i32* @e, align 4
%cmp4 = icmp slt i32 %3, 3
br i1 %cmp4, label %for.body5, label %for.end
for.body5: ; preds = %for.cond3
%4 = load i32* @c, align 4
%tobool = icmp ne i32 %4, 0
br i1 %tobool, label %if.then, label %if.end
if.then: ; preds = %for.body5
%5 = load i32* @a, align 4
%dec = add nsw i32 %5, -1
store i32 %dec, i32* @a, align 4
br label %if.end
if.end: ; preds = %if.then, %for.body5
%6 = load i32* @e, align 4
%idxprom6 = sext i32 %6 to i64
%arrayidx7 = getelementptr inbounds [3 x i8]* getelementptr inbounds ([1 x [3 x i8]]* @f, i32 0, i64 0), i32 0, i64 %idxprom6
store i8 1, i8* %arrayidx7, align 1
br label %for.inc
for.inc: ; preds = %if.end
%7 = load i32* @e, align 4
%inc = add nsw i32 %7, 1
store i32 %inc, i32* @e, align 4
br label %for.cond3
for.end: ; preds = %for.cond3
br label %for.inc8
for.inc8: ; preds = %for.end
%8 = load i32* @d, align 4
%inc9 = add nsw i32 %8, 1
store i32 %inc9, i32* @d, align 4
br label %for.cond
for.end10: ; preds = %for.cond
ret void
}
; Function Attrs: nounwind uwtable
define i32 @main() #0 {
entry:
%retval = alloca i32, align 4
store i32 0, i32* %retval
call void @fn1()
%0 = load i8* getelementptr inbounds ([1 x [3 x i8]]* @f, i32 0, i64 0, i64 1), align 1
%conv = sext i8 %0 to i32
%cmp = icmp ne i32 %conv, 1
br i1 %cmp, label %if.then, label %if.end
if.then: ; preds = %entry
call void @abort() #2
unreachable
if.end: ; preds = %entry
ret i32 0
}
; Function Attrs: noreturn nounwind
declare void @abort() #1
attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { noreturn nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { noreturn nounwind }
!llvm.ident = !{!0}
!0 = !{!"clang version 3.7.0 (trunk 229288) (llvm/trunk 229286:229290M)"}

View File

@@ -0,0 +1,19 @@
; RUN: opt < %s -instsimplify -S | FileCheck %s
@zeroinit = constant {} zeroinitializer
@undef = constant {} undef
define i32 @crash_on_zeroinit() {
; CHECK-LABEL: @crash_on_zeroinit
; CHECK: ret i32 0
%load = load i32* bitcast ({}* @zeroinit to i32*)
ret i32 %load
}
define i32 @crash_on_undef() {
; CHECK-LABEL: @crash_on_undef
; CHECK: ret i32 undef
%load = load i32* bitcast ({}* @undef to i32*)
ret i32 %load
}

View File

@@ -4,9 +4,7 @@
; CHECK: %xtraiter = and i32 %n
; CHECK: %lcmp.mod = icmp ne i32 %xtraiter, 0
; CHECK: %lcmp.overflow = icmp eq i32 %n, 0
; CHECK: %lcmp.or = or i1 %lcmp.overflow, %lcmp.mod
; CHECK: br i1 %lcmp.or, label %for.body.prol, label %for.body.preheader.split
; CHECK: br i1 %lcmp.mod, label %for.body.prol, label %for.body.preheader.split
; CHECK: for.body.prol:
; CHECK: %indvars.iv.prol = phi i64 [ %indvars.iv.next.prol, %for.body.prol ], [ 0, %for.body.preheader ]

View File

@@ -3,7 +3,7 @@
; This tests that setting the unroll count works
; CHECK: for.body.prol:
; CHECK: br i1 %prol.iter.cmp, label %for.body.prol, label %for.body.preheader.split
; CHECK: br label %for.body.preheader.split
; CHECK: for.body:
; CHECK: br i1 %exitcond.1, label %for.end.loopexit.unr-lcssa, label %for.body
; CHECK-NOT: br i1 %exitcond.4, label %for.end.loopexit{{.*}}, label %for.body

View File

@@ -1,19 +1,28 @@
; RUN: opt < %s -S -unroll-runtime -unroll-count=2 -loop-unroll | FileCheck %s
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
; When prologue is fully unrolled, the branch on its end is unconditional.
; Unrolling it is illegal if we can't prove that trip-count+1 doesn't overflow,
; like in this example, where it comes from an argument.
;
; This test is based on an example from here:
; http://stackoverflow.com/questions/23838661/why-is-clang-optimizing-this-code-out
;
; This test case documents how runtime loop unrolling handles the case
; when the backedge-count is -1.
; If %N, the backedge-taken count, is -1 then %0 unsigned-overflows
; and is 0. %xtraiter too is 0, signifying that the total trip-count
; is divisible by 2. The prologue then branches to the unrolled loop
; and executes the 2^32 iterations there, in groups of 2.
; CHECK: entry:
; CHECK-NEXT: %0 = add i32 %N, 1
; CHECK-NEXT: %xtraiter = and i32 %0, 1
; CHECK-NEXT: %lcmp.mod = icmp ne i32 %xtraiter, 0
; CHECK-NEXT: br i1 %lcmp.mod, label %while.body.prol, label %entry.split
; CHECK: while.body.prol:
; CHECK: br i1
; CHECK: br label %entry.split
; CHECK: entry.split:
; Function Attrs: nounwind readnone ssp uwtable
define i32 @foo(i32 %N) #0 {
define i32 @foo(i32 %N) {
entry:
br label %while.body
@@ -26,5 +35,3 @@ while.body: ; preds = %while.body, %entry
while.end: ; preds = %while.body
ret i32 %i
}
attributes #0 = { nounwind readnone ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }

View File

@@ -418,3 +418,85 @@ for.end: ; preds = %for.cond
ret void
}
; Reverse loop
;void foo6(double *in, double *out, unsigned size, int *trigger) {
;
; for (int i=SIZE-1; i>=0; i--) {
; if (trigger[i] > 0) {
; out[i] = in[i] + (double) 0.5;
; }
; }
;}
;AVX2-LABEL: @foo6
;AVX2: icmp sgt <4 x i32> %reverse, zeroinitializer
;AVX2: shufflevector <4 x i1>{{.*}}<4 x i32> <i32 3, i32 2, i32 1, i32 0>
;AVX2: call <4 x double> @llvm.masked.load.v4f64
;AVX2: fadd <4 x double>
;AVX2: call void @llvm.masked.store.v4f64
;AVX2: ret void
;AVX512-LABEL: @foo6
;AVX512: icmp sgt <8 x i32> %reverse, zeroinitializer
;AVX512: shufflevector <8 x i1>{{.*}}<8 x i32> <i32 7, i32 6, i32 5, i32 4
;AVX512: call <8 x double> @llvm.masked.load.v8f64
;AVX512: fadd <8 x double>
;AVX512: call void @llvm.masked.store.v8f64
;AVX512: ret void
define void @foo6(double* %in, double* %out, i32 %size, i32* %trigger) {
entry:
%in.addr = alloca double*, align 8
%out.addr = alloca double*, align 8
%size.addr = alloca i32, align 4
%trigger.addr = alloca i32*, align 8
%i = alloca i32, align 4
store double* %in, double** %in.addr, align 8
store double* %out, double** %out.addr, align 8
store i32 %size, i32* %size.addr, align 4
store i32* %trigger, i32** %trigger.addr, align 8
store i32 4095, i32* %i, align 4
br label %for.cond
for.cond: ; preds = %for.inc, %entry
%0 = load i32* %i, align 4
%cmp = icmp sge i32 %0, 0
br i1 %cmp, label %for.body, label %for.end
for.body: ; preds = %for.cond
%1 = load i32* %i, align 4
%idxprom = sext i32 %1 to i64
%2 = load i32** %trigger.addr, align 8
%arrayidx = getelementptr inbounds i32* %2, i64 %idxprom
%3 = load i32* %arrayidx, align 4
%cmp1 = icmp sgt i32 %3, 0
br i1 %cmp1, label %if.then, label %if.end
if.then: ; preds = %for.body
%4 = load i32* %i, align 4
%idxprom2 = sext i32 %4 to i64
%5 = load double** %in.addr, align 8
%arrayidx3 = getelementptr inbounds double* %5, i64 %idxprom2
%6 = load double* %arrayidx3, align 8
%add = fadd double %6, 5.000000e-01
%7 = load i32* %i, align 4
%idxprom4 = sext i32 %7 to i64
%8 = load double** %out.addr, align 8
%arrayidx5 = getelementptr inbounds double* %8, i64 %idxprom4
store double %add, double* %arrayidx5, align 8
br label %if.end
if.end: ; preds = %if.then, %for.body
br label %for.inc
for.inc: ; preds = %if.end
%9 = load i32* %i, align 4
%dec = add nsw i32 %9, -1
store i32 %dec, i32* %i, align 4
br label %for.cond
for.end: ; preds = %for.cond
ret void
}

View File

@@ -10,6 +10,7 @@ set(LLVM_LINK_COMPONENTS
MC
MCJIT
Object
RuntimeDyld
SelectionDAG
Support
native

View File

@@ -3,6 +3,7 @@ set(LLVM_LINK_COMPONENTS
ExecutionEngine
Interpreter
MC
RuntimeDyld
Support
)

View File

@@ -5,6 +5,7 @@ set(LLVM_LINK_COMPONENTS
IPO
MC
MCJIT
RuntimeDyld
ScalarOpts
Support
Target