Vendor import of llvm release_80 branch r355313:

https://llvm.org/svn/llvm-project/llvm/branches/release_80@355313
2019-03-04 18:25:41 +00:00 · 2019-03-04 18:25:41 +00:00 · 1d6bb9f417
commit 1d6bb9f417
parent bd7f07563c
17 changed files with 322 additions and 127 deletions
--- a/.gitignore
+++ b/.gitignore
@ -72,6 +72,8 @@ docs/_build
 # VS2017 and VSCode config files.
 .vscode
 .vs
+# clangd index
+.clangd

 #==============================================================================#
 # Files created in tree by the Go bindings.
--- a/bindings/go/llvm/ir.go
+++ b/bindings/go/llvm/ir.go
@ -1263,7 +1263,7 @@ func (v Value) Indices() []uint32 {
 	num := C.LLVMGetNumIndices(v.C)
 	indicesPtr := C.LLVMGetIndices(v.C)
 	// https://github.com/golang/go/wiki/cgo#turning-c-arrays-into-go-slices
-	rawIndices := (*[1 << 30]C.uint)(unsafe.Pointer(indicesPtr))[:num:num]
+	rawIndices := (*[1 << 20]C.uint)(unsafe.Pointer(indicesPtr))[:num:num]
 	indices := make([]uint32, num)
 	for i := range indices {
 		indices[i] = uint32(rawIndices[i])
--- a/docs/ReleaseNotes.rst
+++ b/docs/ReleaseNotes.rst
@ -12,7 +12,7 @@ This document contains the release notes for the LLVM Compiler Infrastructure,
 release 8.0.0.  Here we describe the status of LLVM, including major improvements
 from the previous release, improvements in various subprojects of LLVM, and
 some of the current users of the code.  All LLVM releases may be downloaded
-from the `LLVM releases web site <https://llvm.org/releases/>`_.
+from the `LLVM releases web site <https://releases.llvm.org/>`_.

 For more information about LLVM, including information about the latest
 release, please check out the `main LLVM web site <https://llvm.org/>`_.  If you
@ -39,14 +39,19 @@ setting the ``LLVM_TEMPORARILY_ALLOW_OLD_TOOLCHAIN`` CMake variable to
 ``ON``.


+Known Issues
+============
+
+These are issues that couldn't be fixed before the release. See the bug reports
+for the latest status.
+
+* `PR40547 <https://llvm.org/pr40547>`_ Clang gets miscompiled by trunk GCC.
+
+* `PR40761 <https://llvm.org/pr40761>`_ "asan-dynamic" doesn't work on FreeBSD.
+
+
 Non-comprehensive list of changes in this release
 =================================================
-.. NOTE
-   For small 1-3 sentence descriptions, just add an entry at the end of
-   this list. If your description won't fit comfortably in one bullet
-   point (e.g. maybe you would like to give an example of the
-   functionality, or simply have a lot to talk about), see the `NOTE` below
-   for adding a new subsection.

 * The **llvm-cov** tool can now export lcov trace files using the
  `-format=lcov` option of the `export` command.
@ -80,15 +85,7 @@ Non-comprehensive list of changes in this release
  available in the `RFC
  <https://lists.llvm.org/pipermail/llvm-dev/2018-November/127461.html>`_.

-.. NOTE
-   If you would like to document a larger change, then you can add a
-   subsection about it right here. You can copy the following boilerplate
-   and un-indent it (the indentation causes it to be inside this comment).
-
-   Special New Feature
-   -------------------
-
-   Makes programs 10x faster by doing Special New Thing.
+* Windows support for libFuzzer (x86_64).

 Changes to the LLVM IR
 ----------------------
@ -110,17 +107,12 @@ Changes to the AArch64 Target
  on ARM.


-Changes to the ARM Backend
--------------------------
-
- During this release ...
-
-
 Changes to the Hexagon Target
 -----------------------------

 * Added support for Hexagon/HVX V66 ISA.

+
 Changes to the MIPS Target
 --------------------------

@ -142,6 +134,7 @@ Changes to the MIPS Target

 * Numerous bug fixes and code cleanups.

+
 Changes to the PowerPC Target
 -----------------------------

@ -153,7 +146,7 @@ Changes to the PowerPC Target

 * Better overload rules for compatible vector type parameter

-* Support constraint ‘wi’, modifier ‘x’ and VSX registers in inline asm
+* Support constraint 'wi', modifier 'x' and VSX registers in inline asm

 * More ``__float128`` support

@ -198,15 +191,6 @@ Changes to the X86 Target
 * ADCX instruction will no longer be emitted. This instruction is rarely better
  than the legacy ADC instruction and just increased code size.

-Changes to the AMDGPU Target
-----------------------------
-
- During this release ...
-
-Changes to the AVR Target
-----------------------------
-
- During this release ...

 Changes to the WebAssembly Target
 ---------------------------------
@ -220,25 +204,16 @@ use for it will be to add support for returning small structs as multiple
 return values, once the underlying WebAssembly platform itself supports it.
 Additionally, multithreading support is not yet included in the stable ABI.

+
 Changes to the Nios2 Target
 ---------------------------

 * The Nios2 target was removed from this release.

-Changes to the OCaml bindings
-----------------------------
-
-
-
-Changes to the C API
--------------------
-
-
-Changes to the DAG infrastructure
---------------------------------

 Changes to LLDB
 ===============
+
 * Printed source code is now syntax highlighted in the terminal (only for C
  languages).

--- a/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp
@ -471,9 +471,18 @@ void AArch64AsmPrinter::EmitJumpTableInfo() {
  const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
  if (JT.empty()) return;

+  const Function &F = MF->getFunction();
  const TargetLoweringObjectFile &TLOF = getObjFileLowering();
-  MCSection *ReadOnlySec = TLOF.getSectionForJumpTable(MF->getFunction(), TM);
-  OutStreamer->SwitchSection(ReadOnlySec);
+  bool JTInDiffSection =
+      !STI->isTargetCOFF() ||
+      !TLOF.shouldPutJumpTableInFunctionSection(
+          MJTI->getEntryKind() == MachineJumpTableInfo::EK_LabelDifference32,
+          F);
+  if (JTInDiffSection) {
+      // Drop it in the readonly section.
+      MCSection *ReadOnlySec = TLOF.getSectionForJumpTable(F, TM);
+      OutStreamer->SwitchSection(ReadOnlySec);
+  }

  auto AFI = MF->getInfo<AArch64FunctionInfo>();
  for (unsigned JTI = 0, e = JT.size(); JTI != e; ++JTI) {
--- a/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/lib/Target/AArch64/AArch64FrameLowering.cpp
@ -2108,9 +2108,6 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
  while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup))
    ++MBBI;

-  if (MBBI->isTerminator())
-    return;
-
  // Create an UnwindHelp object.
  int UnwindHelpFI =
      MFI.CreateStackObject(/*size*/8, /*alignment*/16, false);
@ -2118,8 +2115,10 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
  // We need to store -2 into the UnwindHelp object at the start of the
  // function.
  DebugLoc DL;
-  RS->enterBasicBlock(MBB);
-  unsigned DstReg = RS->scavengeRegister(&AArch64::GPR64RegClass, MBBI, 0);
+  RS->enterBasicBlockEnd(MBB);
+  RS->backward(std::prev(MBBI));
+  unsigned DstReg = RS->FindUnusedReg(&AArch64::GPR64commonRegClass);
+  assert(DstReg && "There must be a free register after frame setup");
  BuildMI(MBB, MBBI, DL, TII.get(AArch64::MOVi64imm), DstReg).addImm(-2);
  BuildMI(MBB, MBBI, DL, TII.get(AArch64::STURXi))
      .addReg(DstReg, getKillRegState(true))
--- a/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/lib/Target/AArch64/AArch64TargetMachine.cpp
@ -209,8 +209,8 @@ static std::string computeDataLayout(const Triple &TT,

 static Reloc::Model getEffectiveRelocModel(const Triple &TT,
                                           Optional<Reloc::Model> RM) {
-  // AArch64 Darwin is always PIC.
-  if (TT.isOSDarwin())
+  // AArch64 Darwin and Windows are always PIC.
+  if (TT.isOSDarwin() || TT.isOSWindows())
    return Reloc::PIC_;
  // On ELF platforms the default static relocation model has a smart enough
  // linker to cope with referencing external symbols defined in a shared
--- a/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
@ -122,10 +122,3 @@ def : Pat<(select (i32 (seteq I32:$cond, 0)), I32:$lhs, I32:$rhs),
          (SELECT_I32 I32:$rhs, I32:$lhs, I32:$cond)>;
 def : Pat<(select (i32 (seteq I32:$cond, 0)), I64:$lhs, I64:$rhs),
          (SELECT_I64 I64:$rhs, I64:$lhs, I32:$cond)>;
-
-// The legalizer inserts an unnecessary `and 1` to make input conform
-// to getBooleanContents, which we can lower away.
-def : Pat<(select (i32 (and I32:$cond, 1)), I32:$lhs, I32:$rhs),
-          (SELECT_I32 I32:$lhs, I32:$rhs, I32:$cond)>;
-def : Pat<(select (i32 (and I32:$cond, 1)), I64:$lhs, I64:$rhs),
-          (SELECT_I64 I64:$lhs, I64:$rhs, I32:$cond)>;
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@ -1138,15 +1138,23 @@ bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
  if (AM.hasSymbolicDisplacement())
    return true;

+  bool IsRIPRelTLS = false;
  bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
+  if (IsRIPRel) {
+    SDValue Val = N.getOperand(0);
+    if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
+      IsRIPRelTLS = true;
+  }

-  // We can't use an addressing mode in the 64-bit large code model. In the
-  // medium code model, we use can use an mode when RIP wrappers are present.
-  // That signifies access to globals that are known to be "near", such as the
-  // GOT itself.
+  // We can't use an addressing mode in the 64-bit large code model.
+  // Global TLS addressing is an exception. In the medium code model,
+  // we can use a mode when RIP wrappers are present.
+  // That signifies access to globals that are known to be "near",
+  // such as the GOT itself.
  CodeModel::Model M = TM.getCodeModel();
  if (Subtarget->is64Bit() &&
-      (M == CodeModel::Large || (M == CodeModel::Medium && !IsRIPRel)))
+      ((M == CodeModel::Large && !IsRIPRelTLS) ||
+       (M == CodeModel::Medium && !IsRIPRel)))
    return true;

  // Base and index reg must be 0 in order to use %rip as base.
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@ -38134,8 +38134,11 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
      return true;

    // See if this is a single use constant which can be constant folded.
-    SDValue BC = peekThroughOneUseBitcasts(Op);
-    return ISD::isBuildVectorOfConstantSDNodes(BC.getNode());
+    // NOTE: We don't peek through bitcasts here because there is currently
+    // no support for constant folding truncate+bitcast+vector_of_constants. So
+    // we'll just end up with a truncate on both operands which will
+    // get turned back into (truncate (binop)) causing an infinite loop.
+    return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
  };

  auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
--- a/test/CodeGen/AArch64/win64-jumptable.ll
+++ b/test/CodeGen/AArch64/win64-jumptable.ll
@ -0,0 +1,48 @@
+; RUN: llc -o - %s -mtriple=aarch64-windows -aarch64-enable-compress-jump-tables=0 | FileCheck %s
+
+define void @f(i32 %x) {
+entry:
+  switch i32 %x, label %sw.epilog [
+    i32 0, label %sw.bb
+    i32 1, label %sw.bb1
+    i32 2, label %sw.bb2
+    i32 3, label %sw.bb3
+  ]
+
+sw.bb:                                            ; preds = %entry
+  tail call void @g(i32 0) #2
+  br label %sw.epilog
+
+sw.bb1:                                           ; preds = %entry
+  tail call void @g(i32 1) #2
+  br label %sw.epilog
+
+sw.bb2:                                           ; preds = %entry
+  tail call void @g(i32 2) #2
+  br label %sw.epilog
+
+sw.bb3:                                           ; preds = %entry
+  tail call void @g(i32 3) #2
+  br label %sw.epilog
+
+sw.epilog:                                        ; preds = %entry, %sw.bb3, %sw.bb2, %sw.bb1, %sw.bb
+  tail call void @g(i32 10) #2
+  ret void
+}
+
+declare void @g(i32)
+
+; CHECK:		.text
+; CHECK:		f:
+; CHECK:		.seh_proc f
+; CHECK:		b	g
+; CHECK-NEXT:	.p2align	2
+; CHECK-NEXT:	.LJTI0_0:
+; CHECK:		.word	.LBB0_2-.LJTI0_0
+; CHECK:		.word	.LBB0_3-.LJTI0_0
+; CHECK:		.word	.LBB0_4-.LJTI0_0
+; CHECK:		.word	.LBB0_5-.LJTI0_0
+; CHECK:		.section	.xdata,"dr"
+; CHECK:		.seh_handlerdata
+; CHECK:		.text
+; CHECK:		.seh_endproc
--- a/test/CodeGen/AArch64/wineh-try-catch-cbz.ll
+++ b/test/CodeGen/AArch64/wineh-try-catch-cbz.ll
@ -0,0 +1,40 @@
+; RUN: llc < %s | FileCheck %s
+
+; Make sure the prologue is sane.  (Doesn't need to exactly match this,
+; but the original issue only reproduced if the cbz was immediately
+; after the frame setup.)
+
+; CHECK:      sub     sp, sp, #32
+; CHECK-NEXT: stp     x29, x30, [sp, #16]
+; CHECK-NEXT: add     x29, sp, #16
+; CHECK-NEXT: orr     x1, xzr, #0xfffffffffffffffe
+; CHECK-NEXT: stur    x1, [x29, #-16]
+; CHECK-NEXT: cbz     w0, .LBB0_2
+
+target datalayout = "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-windows-msvc19.11.0"
+
+; Function Attrs: uwtable
+define dso_local void @"?f@@YAXH@Z"(i32 %x) local_unnamed_addr #0 personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) {
+entry:
+  %cmp = icmp eq i32 %x, 0
+  br i1 %cmp, label %try.cont, label %if.then
+
+if.then:                                          ; preds = %entry
+  invoke void @"?g@@YAXXZ"()
+          to label %try.cont unwind label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %if.then
+  %0 = catchswitch within none [label %catch] unwind to caller
+
+catch:                                            ; preds = %catch.dispatch
+  %1 = catchpad within %0 [i8* null, i32 64, i8* null]
+  catchret from %1 to label %try.cont
+
+try.cont:                                         ; preds = %entry, %if.then, %catch
+  ret void
+}
+
+declare dso_local void @"?g@@YAXXZ"() local_unnamed_addr #1
+
+declare dso_local i32 @__CxxFrameHandler3(...)
--- a/test/CodeGen/AArch64/wineh-try-catch.ll
+++ b/test/CodeGen/AArch64/wineh-try-catch.ll
@ -22,8 +22,8 @@
 ; CHECK:       add     x29, sp, #32
 ; CHECK:       sub     sp, sp, #624
 ; CHECK:       mov     x19, sp
-; CHECK:       orr     x1, xzr, #0xfffffffffffffffe
-; CHECK:       stur    x1, [x19]
+; CHECK:       orr     x0, xzr, #0xfffffffffffffffe
+; CHECK:       stur    x0, [x19]

 ; Now check that x is stored at fp - 20.  We check that this is the same
 ; location accessed from the funclet to retrieve x.
--- a/test/CodeGen/WebAssembly/select.ll
+++ b/test/CodeGen/WebAssembly/select.ll
@ -17,8 +17,10 @@ define i32 @select_i32_bool(i1 zeroext %a, i32 %b, i32 %c) {

 ; CHECK-LABEL: select_i32_bool_nozext:
 ; CHECK-NEXT: .functype select_i32_bool_nozext (i32, i32, i32) -> (i32){{$}}
-; SLOW-NEXT: i32.select $push0=, $1, $2, $0{{$}}
-; SLOW-NEXT: return     $pop0{{$}}
+; SLOW-NEXT: i32.const  $push0=, 1{{$}}
+; SLOW-NEXT: i32.and    $push1=, $0, $pop0{{$}}
+; SLOW-NEXT: i32.select $push2=, $1, $2, $pop1{{$}}
+; SLOW-NEXT: return     $pop2{{$}}
 define i32 @select_i32_bool_nozext(i1 %a, i32 %b, i32 %c) {
  %cond = select i1 %a, i32 %b, i32 %c
  ret i32 %cond
@ -55,8 +57,10 @@ define i64 @select_i64_bool(i1 zeroext %a, i64 %b, i64 %c) {

 ; CHECK-LABEL: select_i64_bool_nozext:
 ; CHECK-NEXT: .functype select_i64_bool_nozext (i32, i64, i64) -> (i64){{$}}
-; SLOW-NEXT: i64.select $push0=, $1, $2, $0{{$}}
-; SLOW-NEXT: return     $pop0{{$}}
+; SLOW-NEXT: i32.const  $push0=, 1{{$}}
+; SLOW-NEXT: i32.and    $push1=, $0, $pop0{{$}}
+; SLOW-NEXT: i64.select $push2=, $1, $2, $pop1{{$}}
+; SLOW-NEXT: return     $pop2{{$}}
 define i64 @select_i64_bool_nozext(i1 %a, i64 %b, i64 %c) {
  %cond = select i1 %a, i64 %b, i64 %c
  ret i64 %cond
@ -157,3 +161,16 @@ define double @select_f64_ne(i32 %a, double %b, double %c) {
  %cond = select i1 %cmp, double %b, double %c
  ret double %cond
 }
+
+; CHECK-LABEL: pr40805:
+; CHECK-NEXT: .functype pr40805 (i32, i32, i32) -> (i32){{$}}
+; SLOW-NEXT: i32.const  $push0=, 1{{$}}
+; SLOW-NEXT: i32.and    $push1=, $0, $pop0{{$}}
+; SLOW-NEXT: i32.select $push2=, $1, $2, $pop1{{$}}
+; SLOW-NEXT: return     $pop2{{$}}
+define i32 @pr40805(i32 %x, i32 %y, i32 %z) {
+  %a = and i32 %x, 1
+  %b = icmp ne i32 %a, 0
+  %c = select i1 %b, i32 %y, i32 %z
+  ret i32 %c
+}
--- a/test/CodeGen/WebAssembly/simd-select.ll
+++ b/test/CodeGen/WebAssembly/simd-select.ll
@ -29,7 +29,7 @@ define <16 x i8> @vselect_v16i8(<16 x i1> %c, <16 x i8> %x, <16 x i8> %y) {
 ; CHECK-NEXT: i8x16.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
 ; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
 ; CHECK-NEXT: return $pop[[R]]{{$}}
-define <16 x i8> @select_v16i8(i1 %c, <16 x i8> %x, <16 x i8> %y) {
+define <16 x i8> @select_v16i8(i1 zeroext %c, <16 x i8> %x, <16 x i8> %y) {
  %res = select i1 %c, <16 x i8> %x, <16 x i8> %y
  ret <16 x i8> %res
 }
@ -99,7 +99,7 @@ define <8 x i16> @vselect_v8i16(<8 x i1> %c, <8 x i16> %x, <8 x i16> %y) {
 ; CHECK-NEXT: i16x8.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
 ; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
 ; CHECK-NEXT: return $pop[[R]]{{$}}
-define <8 x i16> @select_v8i16(i1 %c, <8 x i16> %x, <8 x i16> %y) {
+define <8 x i16> @select_v8i16(i1 zeroext %c, <8 x i16> %x, <8 x i16> %y) {
  %res = select i1 %c, <8 x i16> %x, <8 x i16> %y
  ret <8 x i16> %res
 }
@ -170,7 +170,7 @@ define <4 x i32> @vselect_v4i32(<4 x i1> %c, <4 x i32> %x, <4 x i32> %y) {
 ; CHECK-NEXT: i32x4.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
 ; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
 ; CHECK-NEXT: return $pop[[R]]{{$}}
-define <4 x i32> @select_v4i32(i1 %c, <4 x i32> %x, <4 x i32> %y) {
+define <4 x i32> @select_v4i32(i1 zeroext %c, <4 x i32> %x, <4 x i32> %y) {
  %res = select i1 %c, <4 x i32> %x, <4 x i32> %y
  ret <4 x i32> %res
 }
@ -240,7 +240,7 @@ define <2 x i64> @vselect_v2i64(<2 x i1> %c, <2 x i64> %x, <2 x i64> %y) {
 ; CHECK-NEXT: i64x2.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
 ; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
 ; CHECK-NEXT: return $pop[[R]]{{$}}
-define <2 x i64> @select_v2i64(i1 %c, <2 x i64> %x, <2 x i64> %y) {
+define <2 x i64> @select_v2i64(i1 zeroext %c, <2 x i64> %x, <2 x i64> %y) {
  %res = select i1 %c, <2 x i64> %x, <2 x i64> %y
  ret <2 x i64> %res
 }
@ -313,7 +313,7 @@ define <4 x float> @vselect_v4f32(<4 x i1> %c, <4 x float> %x, <4 x float> %y) {
 ; CHECK-NEXT: i32x4.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
 ; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
 ; CHECK-NEXT: return $pop[[R]]{{$}}
-define <4 x float> @select_v4f32(i1 %c, <4 x float> %x, <4 x float> %y) {
+define <4 x float> @select_v4f32(i1 zeroext %c, <4 x float> %x, <4 x float> %y) {
  %res = select i1 %c, <4 x float> %x, <4 x float> %y
  ret <4 x float> %res
 }
@ -383,7 +383,7 @@ define <2 x double> @vselect_v2f64(<2 x i1> %c, <2 x double> %x, <2 x double> %y
 ; CHECK-NEXT: i64x2.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
 ; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
 ; CHECK-NEXT: return $pop[[R]]{{$}}
-define <2 x double> @select_v2f64(i1 %c, <2 x double> %x, <2 x double> %y) {
+define <2 x double> @select_v2f64(i1 zeroext %c, <2 x double> %x, <2 x double> %y) {
  %res = select i1 %c, <2 x double> %x, <2 x double> %y
  ret <2 x double> %res
 }
--- a/test/CodeGen/X86/code-model-elf.ll
+++ b/test/CodeGen/X86/code-model-elf.ll
@ -37,6 +37,8 @@ target triple = "x86_64--linux"
@global_data = dso_local global [10 x i32] [i32 1, i32 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0], align 16
@static_data = internal global [10 x i32] zeroinitializer, align 16
@extern_data = external global [10 x i32], align 16
+@thread_data = external thread_local global i32, align 4
+

 define dso_local i32* @lea_static_data() #0 {
 ; SMALL-STATIC-LABEL: lea_static_data:
@ -373,6 +375,70 @@ define dso_local void ()* @lea_extern_fn() #0 {
  ret void ()* @extern_fn
 }

+; FIXME: The result is the same for the small, medium and large models because
+; the test case specifies the pie option, and the TLS type is initial-exec TLS.
+; For PIC code, the large-model code for PIC TLS should be emitted as below.
+
+; .L3:
+; leaq	.L3(%rip), %rbx
+; movabsq	$_GLOBAL_OFFSET_TABLE_-.L3, %r11
+; addq	%r11, %rbx
+; leaq	thread_data@TLSGD(%rip), %rdi
+; movabsq	$__tls_get_addr@PLTOFF, %rax
+; addq	%rbx, %rax
+; call	*%rax
+; movl	(%rax), %eax
+
+; The medium and small model code for pic tls should be emitted as below.
+; data16
+; leaq	thread_data@TLSGD(%rip), %rdi
+; data16
+; data16
+; rex64
+; callq	__tls_get_addr@PLT
+; movl	(%rax), %eax
+
+define dso_local i32 @load_thread_data() #0 {
+; SMALL-STATIC-LABEL: load_thread_data:
+; SMALL-STATIC:       # %bb.0:
+; SMALL-STATIC-NEXT:    movq    thread_data@GOTTPOFF(%rip), %rax
+; SMALL-STATIC-NEXT:    movl    %fs:(%rax), %eax
+; SMALL-STATIC-NEXT:    retq
+;
+; MEDIUM-STATIC-LABEL: load_thread_data:
+; MEDIUM-STATIC:       # %bb.0:
+; MEDIUM-STATIC-NEXT:    movq    thread_data@GOTTPOFF(%rip), %rax
+; MEDIUM-STATIC-NEXT:    movl    %fs:(%rax), %eax
+; MEDIUM-STATIC-NEXT:    retq
+;
+; LARGE-STATIC-LABEL: load_thread_data:
+; LARGE-STATIC:       # %bb.0:
+; LARGE-STATIC-NEXT:    movq    thread_data@GOTTPOFF(%rip), %rax
+; LARGE-STATIC-NEXT:    movl    %fs:(%rax), %eax
+; LARGE-STATIC-NEXT:    retq
+;
+; SMALL-PIC-LABEL: load_thread_data:
+; SMALL-PIC:       # %bb.0:
+; SMALL-PIC-NEXT:    movq    thread_data@GOTTPOFF(%rip), %rax
+; SMALL-PIC-NEXT:    movl    %fs:(%rax), %eax
+; SMALL-PIC-NEXT:    retq
+;
+; MEDIUM-PIC-LABEL: load_thread_data:
+; MEDIUM-PIC:       # %bb.0:
+; MEDIUM-PIC-NEXT:    movq    thread_data@GOTTPOFF(%rip), %rax
+; MEDIUM-PIC-NEXT:    movl    %fs:(%rax), %eax
+; MEDIUM-PIC-NEXT:    retq
+;
+; LARGE-PIC-LABEL: load_thread_data:
+; LARGE-PIC:       # %bb.0:
+; LARGE-PIC-NEXT:    movq    thread_data@GOTTPOFF(%rip), %rax
+; LARGE-PIC-NEXT:    movl    %fs:(%rax), %eax
+; LARGE-PIC-NEXT:    retq
+;
+  %1 = load i32, i32* @thread_data, align 4
+  ret i32 %1
+}
+
 attributes #0 = { noinline nounwind uwtable }

 !llvm.module.flags = !{!0, !1, !2}
--- a/test/CodeGen/X86/pr40891.ll
+++ b/test/CodeGen/X86/pr40891.ll
@ -0,0 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx2 | FileCheck %s
+
+; Make sure this sequence doesn't hang in DAG combine.
+
+define <8 x i32> @foo(<8 x i64> %x, <4 x i64> %y) {
+; CHECK-LABEL: foo:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; CHECK-NEXT:    vandps {{\.LCPI.*}}, %ymm1, %ymm1
+; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; CHECK-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT:    retl
+  %a = shufflevector <4 x i64> %y, <4 x i64> <i64 12345, i64 67890, i64 13579, i64 24680>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %b = and <8 x i64> %x, %a
+  %c = trunc <8 x i64> %b to <8 x i32>
+  ret <8 x i32> %c
+}
+
--- a/tools/llvm-xray/xray-converter.cpp
+++ b/tools/llvm-xray/xray-converter.cpp
@ -18,7 +18,6 @@
 #include "llvm/Support/EndianStream.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/FormatVariadic.h"
-#include "llvm/Support/JSON.h"
 #include "llvm/Support/ScopedPrinter.h"
 #include "llvm/Support/YAMLTraits.h"
 #include "llvm/Support/raw_ostream.h"
@ -242,6 +241,31 @@ StackTrieNode *findOrCreateStackNode(
  return CurrentStack;
 }

+void writeTraceViewerRecord(uint16_t Version, raw_ostream &OS, int32_t FuncId,
+                            uint32_t TId, uint32_t PId, bool Symbolize,
+                            const FuncIdConversionHelper &FuncIdHelper,
+                            double EventTimestampUs,
+                            const StackTrieNode &StackCursor,
+                            StringRef FunctionPhenotype) {
+  OS << "    ";
+  if (Version >= 3) {
+    OS << llvm::formatv(
+        R"({ "name" : "{0}", "ph" : "{1}", "tid" : "{2}", "pid" : "{3}", )"
+        R"("ts" : "{4:f4}", "sf" : "{5}" })",
+        (Symbolize ? FuncIdHelper.SymbolOrNumber(FuncId)
+                   : llvm::to_string(FuncId)),
+        FunctionPhenotype, TId, PId, EventTimestampUs,
+        StackCursor.ExtraData.id);
+  } else {
+    OS << llvm::formatv(
+        R"({ "name" : "{0}", "ph" : "{1}", "tid" : "{2}", "pid" : "1", )"
+        R"("ts" : "{3:f3}", "sf" : "{4}" })",
+        (Symbolize ? FuncIdHelper.SymbolOrNumber(FuncId)
+                   : llvm::to_string(FuncId)),
+        FunctionPhenotype, TId, EventTimestampUs, StackCursor.ExtraData.id);
+  }
+}
+
 } // namespace

 void TraceConverter::exportAsChromeTraceEventFormat(const Trace &Records,
@ -252,14 +276,18 @@ void TraceConverter::exportAsChromeTraceEventFormat(const Trace &Records,

  unsigned id_counter = 0;

+  OS << "{\n  \"traceEvents\": [";
  DenseMap<uint32_t, StackTrieNode *> StackCursorByThreadId{};
  DenseMap<uint32_t, SmallVector<StackTrieNode *, 4>> StackRootsByThreadId{};
  DenseMap<unsigned, StackTrieNode *> StacksByStackId{};
  std::forward_list<StackTrieNode> NodeStore{};
-
-  // Create a JSON Array which will hold all trace events.
-  json::Array TraceEvents;
+  int loop_count = 0;
  for (const auto &R : Records) {
+    if (loop_count++ == 0)
+      OS << "\n";
+    else
+      OS << ",\n";
+
    // Chrome trace event format always wants data in micros.
    // CyclesPerMicro = CycleHertz / 10^6
    // TSC / CyclesPerMicro == TSC * 10^6 / CycleHertz == MicroTimestamp
@ -284,15 +312,8 @@ void TraceConverter::exportAsChromeTraceEventFormat(const Trace &Records,
      // type of B for begin or E for end, thread id, process id,
      // timestamp in microseconds, and a stack frame id. The ids are logged
      // in an id dictionary after the events.
-      TraceEvents.push_back(json::Object({
-          {"name", Symbolize ? FuncIdHelper.SymbolOrNumber(R.FuncId)
-                             : llvm::to_string(R.FuncId)},
-          {"ph", "B"},
-          {"tid", llvm::to_string(R.TId)},
-          {"pid", llvm::to_string(Version >= 3 ? R.PId : 1)},
-          {"ts", llvm::formatv("{0:f4}", EventTimestampUs)},
-          {"sf", llvm::to_string(StackCursor->ExtraData.id)},
-      }));
+      writeTraceViewerRecord(Version, OS, R.FuncId, R.TId, R.PId, Symbolize,
+                             FuncIdHelper, EventTimestampUs, *StackCursor, "B");
      break;
    case RecordTypes::EXIT:
    case RecordTypes::TAIL_EXIT:
@ -303,51 +324,43 @@ void TraceConverter::exportAsChromeTraceEventFormat(const Trace &Records,
      // (And/Or in loop termination below)
      StackTrieNode *PreviousCursor = nullptr;
      do {
-        TraceEvents.push_back(json::Object({
-            {"name", Symbolize
-                         ? FuncIdHelper.SymbolOrNumber(StackCursor->FuncId)
-                         : llvm::to_string(StackCursor->FuncId)},
-            {"ph", "E"},
-            {"tid", llvm::to_string(R.TId)},
-            {"pid", llvm::to_string(Version >= 3 ? R.PId : 1)},
-            {"ts", llvm::formatv("{0:f4}", EventTimestampUs)},
-            {"sf", llvm::to_string(StackCursor->ExtraData.id)},
-        }));
+        if (PreviousCursor != nullptr) {
+          OS << ",\n";
+        }
+        writeTraceViewerRecord(Version, OS, StackCursor->FuncId, R.TId, R.PId,
+                               Symbolize, FuncIdHelper, EventTimestampUs,
+                               *StackCursor, "E");
        PreviousCursor = StackCursor;
        StackCursor = StackCursor->Parent;
      } while (PreviousCursor->FuncId != R.FuncId && StackCursor != nullptr);
      break;
    }
  }
+  OS << "\n  ],\n"; // Close the Trace Events array.
+  OS << "  "
+     << "\"displayTimeUnit\": \"ns\",\n";

  // The stackFrames dictionary substantially reduces size of the output file by
  // avoiding repeating the entire call stack of function names for each entry.
-  json::Object StackFrames;
-  for (const auto &Stack : StacksByStackId) {
-    const auto &StackId = Stack.first;
-    const auto &StackFunctionNode = Stack.second;
-    json::Object::iterator It;
-    std::tie(It, std::ignore) = StackFrames.insert({
-        llvm::to_string(StackId),
-        json::Object{
-            {"name",
-             Symbolize ? FuncIdHelper.SymbolOrNumber(StackFunctionNode->FuncId)
-                       : llvm::to_string(StackFunctionNode->FuncId)}},
-    });
-
-    if (StackFunctionNode->Parent != nullptr)
-      It->second.getAsObject()->insert(
-          {"parent", llvm::to_string(StackFunctionNode->Parent->ExtraData.id)});
+  OS << R"(  "stackFrames": {)";
+  int stack_frame_count = 0;
+  for (auto map_iter : StacksByStackId) {
+    if (stack_frame_count++ == 0)
+      OS << "\n";
+    else
+      OS << ",\n";
+    OS << "    ";
+    OS << llvm::formatv(
+        R"("{0}" : { "name" : "{1}")", map_iter.first,
+        (Symbolize ? FuncIdHelper.SymbolOrNumber(map_iter.second->FuncId)
+                   : llvm::to_string(map_iter.second->FuncId)));
+    if (map_iter.second->Parent != nullptr)
+      OS << llvm::formatv(R"(, "parent": "{0}")",
+                          map_iter.second->Parent->ExtraData.id);
+    OS << " }";
  }
-
-  json::Object TraceJSON{
-      {"displayTimeUnit", "ns"},
-      {"traceEvents", std::move(TraceEvents)},
-      {"stackFrames", std::move(StackFrames)},
-  };
-
-  // Pretty-print the JSON using two spaces for indentations.
-  OS << formatv("{0:2}", json::Value(std::move(TraceJSON)));
+  OS << "\n  }\n"; // Close the stack frames map.
+  OS << "}\n";     // Close the JSON entry.
 }

 namespace llvm {