From 104bd8179fb5f6551c65c94ebcd0a4918b060189 Mon Sep 17 00:00:00 2001 From: Roman Divacky Date: Fri, 2 Apr 2010 08:54:30 +0000 Subject: [PATCH] Update LLVM to r100181. --- Makefile | 2 +- autoconf/configure.ac | 5 + configure | 7 +- docs/CodeGenerator.html | 6 +- docs/ProgrammersManual.html | 44 +- docs/ReleaseNotes.html | 78 +- docs/SourceLevelDebugging.html | 29 +- docs/TableGenFundamentals.html | 5 +- docs/tutorial/OCamlLangImpl6.html | 4 +- docs/tutorial/OCamlLangImpl7.html | 4 +- .../Chapter6/myocamlbuild.ml | 2 +- .../Chapter7/myocamlbuild.ml | 2 +- include/llvm/ADT/PointerUnion.h | 6 +- include/llvm/ADT/SmallVector.h | 24 +- include/llvm/ADT/Statistic.h | 10 + include/llvm/ADT/StringMap.h | 8 + include/llvm/Analysis/DebugInfo.h | 17 +- include/llvm/Analysis/Dominators.h | 44 +- include/llvm/CodeGen/AsmPrinter.h | 4 - include/llvm/CodeGen/DwarfWriter.h | 10 +- include/llvm/CodeGen/LiveInterval.h | 28 +- include/llvm/CodeGen/LiveIntervalAnalysis.h | 4 +- include/llvm/CodeGen/LiveStackAnalysis.h | 4 +- include/llvm/CodeGen/MachineOperand.h | 5 + include/llvm/CodeGen/RuntimeLibcalls.h | 4 + include/llvm/CodeGen/SelectionDAG.h | 82 +- include/llvm/CodeGen/SelectionDAGISel.h | 2 + include/llvm/CodeGen/SelectionDAGNodes.h | 2 +- include/llvm/Function.h | 7 +- include/llvm/InlineAsm.h | 19 +- include/llvm/Instruction.h | 44 +- include/llvm/Instructions.h | 34 +- include/llvm/Intrinsics.td | 54 +- include/llvm/IntrinsicsARM.td | 14 +- include/llvm/IntrinsicsPowerPC.td | 42 +- include/llvm/IntrinsicsX86.td | 68 +- include/llvm/LLVMContext.h | 6 + include/llvm/MC/MCAsmLayout.h | 61 + include/llvm/MC/MCAssembler.h | 225 ++-- include/llvm/MC/MCContext.h | 15 +- include/llvm/MC/MCExpr.h | 5 - include/llvm/MC/MCInst.h | 9 + include/llvm/MC/MCObjectWriter.h | 9 +- include/llvm/MC/MCSection.h | 7 +- include/llvm/MC/MCStreamer.h | 5 +- include/llvm/MC/MachObjectWriter.h | 7 +- include/llvm/PassManagers.h | 4 +- include/llvm/Support/Allocator.h | 40 + 
include/llvm/Support/CFG.h | 12 +- include/llvm/Support/CallSite.h | 283 ++-- include/llvm/Support/Casting.h | 20 +- include/llvm/Support/DebugLoc.h | 65 +- include/llvm/Support/FileUtilities.h | 13 + include/llvm/Support/IRBuilder.h | 22 +- include/llvm/Support/MathExtras.h | 50 +- include/llvm/Support/Timer.h | 179 +-- include/llvm/Support/ValueHandle.h | 5 +- include/llvm/Support/raw_ostream.h | 1 + include/llvm/System/Memory.h | 2 +- include/llvm/Target/TargetAsmBackend.h | 25 + include/llvm/Target/TargetInstrDesc.h | 19 + include/llvm/Target/TargetLowering.h | 64 +- include/llvm/Target/TargetMachine.h | 6 +- include/llvm/Target/TargetSelectionDAG.td | 16 +- include/llvm/Transforms/Utils/BuildLibCalls.h | 6 + include/llvm/Transforms/Utils/SSAUpdater.h | 32 +- include/llvm/Type.h | 8 +- include/llvm/Value.h | 102 +- lib/Analysis/CaptureTracking.cpp | 2 +- lib/Analysis/DebugInfo.cpp | 5 +- lib/Analysis/IPA/CallGraphSCCPass.cpp | 17 +- lib/Analysis/IPA/GlobalsModRef.cpp | 2 +- lib/Analysis/InlineCost.cpp | 6 +- lib/Analysis/LiveValues.cpp | 2 +- lib/Analysis/LoopPass.cpp | 18 +- lib/Analysis/MemoryBuiltins.cpp | 2 +- lib/Analysis/ProfileEstimatorPass.cpp | 2 +- lib/Analysis/ProfileInfo.cpp | 18 +- lib/Analysis/ProfileInfoLoaderPass.cpp | 2 +- lib/Analysis/ProfileVerifierPass.cpp | 6 +- lib/Analysis/ScalarEvolutionExpander.cpp | 13 +- lib/Archive/ArchiveWriter.cpp | 2 +- lib/AsmParser/LLLexer.h | 2 +- lib/AsmParser/LLParser.cpp | 73 +- lib/AsmParser/LLParser.h | 13 +- lib/Bitcode/Reader/BitcodeReader.cpp | 10 +- lib/Bitcode/Writer/BitcodeWriter.cpp | 4 +- lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 33 +- lib/CodeGen/AsmPrinter/DIE.cpp | 31 +- lib/CodeGen/AsmPrinter/DIE.h | 38 +- lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 287 ++-- lib/CodeGen/AsmPrinter/DwarfDebug.h | 32 +- lib/CodeGen/AsmPrinter/DwarfException.cpp | 59 +- lib/CodeGen/AsmPrinter/DwarfPrinter.cpp | 4 +- lib/CodeGen/AsmPrinter/DwarfWriter.cpp | 17 +- lib/CodeGen/BranchFolding.cpp | 12 +- 
lib/CodeGen/DwarfEHPrepare.cpp | 322 ++++- lib/CodeGen/LiveInterval.cpp | 15 +- lib/CodeGen/LiveIntervalAnalysis.cpp | 19 +- lib/CodeGen/LiveStackAnalysis.cpp | 2 +- lib/CodeGen/LiveVariables.cpp | 10 +- lib/CodeGen/MachineBasicBlock.cpp | 64 +- lib/CodeGen/MachineCSE.cpp | 20 +- lib/CodeGen/MachineFunction.cpp | 4 +- lib/CodeGen/MachineModuleInfo.cpp | 6 +- lib/CodeGen/OptimizeExts.cpp | 25 +- lib/CodeGen/PHIElimination.cpp | 2 +- lib/CodeGen/PreAllocSplitting.cpp | 2 +- lib/CodeGen/RegAllocLocal.cpp | 568 ++++---- lib/CodeGen/ScheduleDAGInstrs.cpp | 72 +- lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 46 +- lib/CodeGen/SelectionDAG/FastISel.cpp | 7 +- .../SelectionDAG/FunctionLoweringInfo.cpp | 13 +- lib/CodeGen/SelectionDAG/InstrEmitter.cpp | 288 ++-- lib/CodeGen/SelectionDAG/InstrEmitter.h | 36 +- lib/CodeGen/SelectionDAG/SDNodeDbgValue.h | 14 +- .../SelectionDAG/ScheduleDAGSDNodes.cpp | 137 +- lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 284 ++-- .../SelectionDAG/SelectionDAGBuilder.cpp | 29 +- lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp | 117 +- lib/CodeGen/SelectionDAG/TargetLowering.cpp | 21 +- lib/CodeGen/SimpleRegisterCoalescing.cpp | 115 +- lib/CodeGen/TargetLoweringObjectFileImpl.cpp | 4 +- lib/CodeGen/TwoAddressInstructionPass.cpp | 12 +- lib/CodeGen/VirtRegRewriter.cpp | 22 +- lib/ExecutionEngine/ExecutionEngine.cpp | 83 +- .../Interpreter/ExternalFunctions.cpp | 2 + lib/MC/MCAsmStreamer.cpp | 20 +- lib/MC/MCAssembler.cpp | 605 +++++---- lib/MC/MCContext.cpp | 20 +- lib/MC/MCExpr.cpp | 26 +- lib/MC/MCInst.cpp | 17 + lib/MC/MCMachOStreamer.cpp | 89 +- lib/MC/MCParser/AsmParser.cpp | 4 +- lib/MC/MCSection.cpp | 6 +- lib/MC/MachObjectWriter.cpp | 158 ++- lib/Support/APFloat.cpp | 284 ++-- lib/Support/APInt.cpp | 79 +- lib/Support/CommandLine.cpp | 4 +- lib/Support/Debug.cpp | 3 +- lib/Support/ErrorHandling.cpp | 5 +- lib/Support/MemoryBuffer.cpp | 43 +- lib/Support/Statistic.cpp | 65 +- lib/Support/Timer.cpp | 512 ++++---- lib/Support/Triple.cpp | 2 +- 
lib/Support/raw_ostream.cpp | 46 +- lib/System/Unix/Mutex.inc | 6 - lib/System/Unix/Path.inc | 4 +- lib/System/Win32/Program.inc | 34 +- lib/System/Win32/Signals.inc | 1 + lib/Target/ARM/ARM.td | 21 +- lib/Target/ARM/ARMBaseInstrInfo.cpp | 31 +- lib/Target/ARM/ARMISelDAGToDAG.cpp | 96 +- lib/Target/ARM/ARMISelLowering.cpp | 3 + lib/Target/ARM/ARMInstrFormats.td | 195 +-- lib/Target/ARM/ARMInstrInfo.td | 2 + lib/Target/ARM/ARMInstrNEON.td | 1167 +++++++++-------- lib/Target/ARM/ARMInstrVFP.td | 8 +- lib/Target/ARM/ARMLoadStoreOptimizer.cpp | 6 +- lib/Target/ARM/ARMSubtarget.cpp | 15 +- lib/Target/ARM/ARMSubtarget.h | 5 + lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp | 11 +- lib/Target/ARM/AsmPrinter/ARMMCInstLower.cpp | 4 +- lib/Target/ARM/NEONPreAllocPass.cpp | 18 +- lib/Target/ARM/README.txt | 3 + lib/Target/ARM/Thumb1InstrInfo.cpp | 2 + lib/Target/ARM/Thumb2InstrInfo.cpp | 4 +- lib/Target/Alpha/AlphaInstrInfo.cpp | 15 +- lib/Target/Blackfin/BlackfinInstrInfo.td | 27 +- lib/Target/Blackfin/BlackfinIntrinsics.td | 6 +- lib/Target/Blackfin/BlackfinRegisterInfo.cpp | 6 +- lib/Target/CellSPU/SPU.h | 67 - lib/Target/CellSPU/SPUISelDAGToDAG.cpp | 10 +- lib/Target/CellSPU/SPUISelLowering.cpp | 5 +- lib/Target/CellSPU/SPUInstrInfo.cpp | 15 +- lib/Target/CellSPU/SPUInstrInfo.td | 13 +- lib/Target/CellSPU/SPURegisterInfo.cpp | 87 +- lib/Target/CellSPU/SPURegisterInfo.h | 19 + lib/Target/MBlaze/MBlazeIntrinsics.td | 4 +- lib/Target/MSIL/MSILWriter.cpp | 2 +- .../MSP430/AsmPrinter/MSP430MCInstLower.cpp | 4 +- lib/Target/MSP430/MSP430InstrInfo.cpp | 5 + lib/Target/Mangler.cpp | 5 +- lib/Target/Mips/MipsInstrFPU.td | 14 +- lib/Target/Mips/MipsInstrInfo.cpp | 15 +- lib/Target/PIC16/PIC16InstrInfo.cpp | 5 + lib/Target/PIC16/PIC16Section.cpp | 18 +- lib/Target/PIC16/PIC16Section.h | 17 +- .../PowerPC/AsmPrinter/PPCAsmPrinter.cpp | 13 +- lib/Target/PowerPC/PPCBranchSelector.cpp | 2 +- lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 20 +- lib/Target/PowerPC/PPCISelLowering.cpp | 12 +- 
lib/Target/PowerPC/PPCISelLowering.h | 13 +- lib/Target/PowerPC/PPCInstrAltivec.td | 17 +- lib/Target/PowerPC/PPCInstrInfo.cpp | 15 +- lib/Target/PowerPC/PPCRegisterInfo.cpp | 30 +- lib/Target/SystemZ/SystemZInstrFP.td | 12 +- lib/Target/SystemZ/SystemZInstrInfo.cpp | 4 + lib/Target/SystemZ/SystemZInstrInfo.td | 64 +- lib/Target/X86/AsmPrinter/X86AsmPrinter.cpp | 5 + lib/Target/X86/AsmPrinter/X86MCInstLower.cpp | 12 +- lib/Target/X86/CMakeLists.txt | 1 + lib/Target/X86/SSEDomainFix.cpp | 499 +++++++ lib/Target/X86/X86.h | 4 + lib/Target/X86/X86.td | 15 +- lib/Target/X86/X86AsmBackend.cpp | 131 ++ lib/Target/X86/X86FastISel.cpp | 4 +- lib/Target/X86/X86ISelLowering.cpp | 214 +-- lib/Target/X86/X86ISelLowering.h | 15 +- lib/Target/X86/X86Instr64bit.td | 283 ++-- lib/Target/X86/X86InstrFormats.td | 69 +- lib/Target/X86/X86InstrInfo.cpp | 79 +- lib/Target/X86/X86InstrInfo.h | 14 +- lib/Target/X86/X86InstrInfo.td | 720 +++++----- lib/Target/X86/X86InstrMMX.td | 13 - lib/Target/X86/X86InstrSSE.td | 128 +- lib/Target/X86/X86Subtarget.cpp | 4 + lib/Target/X86/X86Subtarget.h | 4 + lib/Target/X86/X86TargetMachine.cpp | 10 + lib/Target/X86/X86TargetMachine.h | 1 + lib/Target/XCore/XCoreInstrInfo.cpp | 15 +- lib/Target/XCore/XCoreInstrInfo.td | 2 +- lib/Transforms/IPO/ArgumentPromotion.cpp | 15 +- .../IPO/DeadArgumentElimination.cpp | 97 +- lib/Transforms/IPO/GlobalOpt.cpp | 74 +- lib/Transforms/IPO/PruneEH.cpp | 2 +- .../InstCombine/InstCombineCalls.cpp | 2 +- lib/Transforms/Scalar/ABCD.cpp | 247 ++-- lib/Transforms/Scalar/CodeGenPrepare.cpp | 8 +- lib/Transforms/Scalar/GVN.cpp | 14 +- lib/Transforms/Scalar/IndVarSimplify.cpp | 34 +- lib/Transforms/Scalar/LoopStrengthReduce.cpp | 7 +- lib/Transforms/Scalar/Reg2Mem.cpp | 2 +- lib/Transforms/Scalar/SCCP.cpp | 19 +- lib/Transforms/Scalar/SimplifyCFGPass.cpp | 2 +- lib/Transforms/Scalar/SimplifyLibCalls.cpp | 26 +- lib/Transforms/Utils/AddrModeMatcher.cpp | 5 +- lib/Transforms/Utils/BreakCriticalEdges.cpp | 2 +- 
lib/Transforms/Utils/BuildLibCalls.cpp | 28 +- lib/Transforms/Utils/LowerInvoke.cpp | 4 +- .../Utils/PromoteMemoryToRegister.cpp | 2 +- lib/Transforms/Utils/SSAUpdater.cpp | 503 ++++--- lib/Transforms/Utils/SimplifyCFG.cpp | 8 +- lib/VMCore/AsmWriter.cpp | 9 +- lib/VMCore/AutoUpgrade.cpp | 17 + lib/VMCore/CMakeLists.txt | 2 + lib/VMCore/Constants.cpp | 20 +- lib/VMCore/ConstantsContext.h | 55 +- lib/VMCore/Core.cpp | 8 +- lib/VMCore/DebugLoc.cpp | 288 ++++ lib/VMCore/Function.cpp | 18 +- lib/VMCore/Globals.cpp | 4 +- lib/VMCore/IRBuilder.cpp | 13 - lib/VMCore/InlineAsm.cpp | 25 +- lib/VMCore/Instruction.cpp | 2 +- lib/VMCore/Instructions.cpp | 39 +- lib/VMCore/LLVMContext.cpp | 57 +- lib/VMCore/LLVMContextImpl.cpp | 103 ++ lib/VMCore/LLVMContextImpl.h | 119 +- lib/VMCore/Metadata.cpp | 143 +- lib/VMCore/Module.cpp | 2 +- lib/VMCore/PassManager.cpp | 54 +- lib/VMCore/Type.cpp | 10 +- lib/VMCore/Value.cpp | 6 +- lib/VMCore/ValueSymbolTable.cpp | 10 +- lib/VMCore/Verifier.cpp | 87 +- test/Bitcode/sse41_pmulld.ll | 2 + test/Bitcode/sse41_pmulld.ll.bc | Bin 0 -> 560 bytes test/CodeGen/ARM/fabss.ll | 3 +- test/CodeGen/ARM/fadds.ll | 3 +- test/CodeGen/ARM/fdivs.ll | 3 +- test/CodeGen/ARM/fmacs.ll | 3 +- test/CodeGen/ARM/fmscs.ll | 3 +- test/CodeGen/ARM/fmuls.ll | 3 +- test/CodeGen/ARM/fnegs.ll | 3 +- test/CodeGen/ARM/fnmacs.ll | 4 +- test/CodeGen/ARM/fnmscs.ll | 3 +- test/CodeGen/ARM/fp_convert.ll | 3 +- test/CodeGen/ARM/fsubs.ll | 4 +- test/CodeGen/CellSPU/bigstack.ll | 17 + test/CodeGen/Generic/2010-ZeroSizedArg.ll | 17 + test/CodeGen/Generic/addr-label.ll | 23 + test/CodeGen/PIC16/2009-07-17-PR4566-pic16.ll | 1 + test/CodeGen/PIC16/2009-11-20-NewNode.ll | 1 + test/CodeGen/PIC16/C16-15.ll | 1 + test/CodeGen/PIC16/global-in-user-section.ll | 1 + test/CodeGen/PIC16/globals.ll | 1 + test/CodeGen/PIC16/sext.ll | 1 + .../PowerPC/2010-04-01-MachineCSEBug.ll | 70 + test/CodeGen/PowerPC/eqv-andc-orc-nor.ll | 24 +- .../PowerPC/tango.net.ftp.FtpClient.ll | 4 +- 
.../CodeGen/Thumb2/2009-08-04-CoalescerBug.ll | 2 +- .../Thumb2/2009-08-04-ScavengerAssert.ll | 2 +- .../Thumb2/2009-08-04-SubregLoweringBug.ll | 4 +- .../Thumb2/2009-08-04-SubregLoweringBug2.ll | 2 +- .../Thumb2/2009-08-04-SubregLoweringBug3.ll | 2 +- test/CodeGen/Thumb2/2009-08-07-NeonFPBug.ll | 2 +- test/CodeGen/X86/2007-01-13-StackPtrIndex.ll | 1 - test/CodeGen/X86/2007-04-25-MMX-PADDQ.ll | 10 +- test/CodeGen/X86/2009-02-05-CoalescerBug.ll | 6 +- test/CodeGen/X86/2009-02-26-MachineLICMBug.ll | 11 +- test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll | 2 +- test/CodeGen/X86/byval7.ll | 9 +- test/CodeGen/X86/coalesce-esp.ll | 2 +- test/CodeGen/X86/dagcombine-buildvector.ll | 10 +- test/CodeGen/X86/gather-addresses.ll | 2 +- test/CodeGen/X86/licm-symbol.ll | 2 +- test/CodeGen/X86/memcpy-2.ll | 117 +- test/CodeGen/X86/memset-2.ll | 54 +- test/CodeGen/X86/memset64-on-x86-32.ll | 5 +- test/CodeGen/X86/pic.ll | 8 +- test/CodeGen/X86/pmul.ll | 2 +- test/CodeGen/X86/pmulld.ll | 16 + test/CodeGen/X86/postalloc-coalescing.ll | 2 +- test/CodeGen/X86/pr2659.ll | 6 + test/CodeGen/X86/sibcall.ll | 42 + test/CodeGen/X86/small-byval-memcpy.ll | 6 +- test/CodeGen/X86/sse-align-12.ll | 22 +- test/CodeGen/X86/sse-align-6.ll | 2 +- test/CodeGen/X86/sse3.ll | 8 +- test/CodeGen/X86/unaligned-load.ll | 24 +- test/CodeGen/X86/vec_compare.ll | 6 +- .../X86/{vec_insert_4.ll => vec_insert-4.ll} | 0 test/CodeGen/X86/vec_insert-9.ll | 9 + test/CodeGen/X86/vec_return.ll | 2 +- test/CodeGen/X86/vec_set.ll | 2 +- test/CodeGen/X86/vec_shuffle-7.ll | 2 +- test/CodeGen/X86/vec_shuffle-9.ll | 2 +- test/CodeGen/X86/vec_shuffle.ll | 3 +- test/CodeGen/X86/vec_zero.ll | 4 +- test/CodeGen/X86/vec_zero_cse.ll | 3 +- test/CodeGen/X86/widen_arith-5.ll | 2 +- test/CodeGen/X86/widen_cast-2.ll | 2 +- test/CodeGen/X86/widen_load-2.ll | 46 +- test/CodeGen/X86/xor-icmp.ll | 4 +- test/CodeGen/X86/xor.ll | 2 +- .../2009-11-03-InsertExtractValue.ll | 8 +- test/DebugInfo/2010-03-22-CU-HighLow.ll | 9 + 
test/DebugInfo/2010-03-24-MemberFn.ll | 62 + .../2010-03-30-InvalidDbgInfoCrash.ll | 30 + test/Feature/unions.ll | 2 + .../2010-03-22-empty-baseclass.cpp | 134 ++ test/FrontendObjC/2010-03-17-StructRef.m | 2 +- test/MC/AsmParser/X86/x86_32-bit_cat.s | 36 + test/MC/AsmParser/X86/x86_32-encoding.s | 56 + test/MC/MachO/absolutize.s | 190 ++- test/MC/MachO/darwin-x86_64-reloc.s | 71 +- test/TableGen/2010-03-24-PrematureDefaults.td | 44 + .../GVN/2010-03-31-RedundantPHIs.ll | 46 + test/Transforms/GVN/rle.ll | 12 + test/Transforms/Inline/noinline.ll | 18 + test/Transforms/InstCombine/objsize.ll | 15 +- .../SimplifyCFG/2010-03-30-InvokeCrash.ll | 18 + test/Transforms/SimplifyLibCalls/StrCpy.ll | 45 +- tools/Makefile | 8 +- tools/bugpoint/BugDriver.cpp | 5 +- tools/bugpoint/BugDriver.h | 1 + tools/edis/Makefile | 17 +- tools/llc/llc.cpp | 4 - tools/llvm-extract/llvm-extract.cpp | 1 - tools/llvm-ld/Optimize.cpp | 5 - tools/llvm-ld/llvm-ld.cpp | 139 +- tools/llvm-link/llvm-link.cpp | 1 - tools/llvm-mc/llvm-mc.cpp | 10 +- tools/llvmc/plugins/Base/Base.td.in | 2 + tools/lto/LTOCodeGenerator.cpp | 4 - tools/opt/opt.cpp | 5 +- unittests/ADT/SmallVectorTest.cpp | 6 + utils/TableGen/CodeGenDAGPatterns.cpp | 529 ++++---- utils/TableGen/CodeGenDAGPatterns.h | 15 +- utils/TableGen/CodeGenInstruction.cpp | 20 + utils/TableGen/CodeGenInstruction.h | 7 + utils/TableGen/CodeGenTarget.cpp | 25 +- utils/TableGen/DAGISelEmitter.cpp | 74 +- utils/TableGen/DAGISelMatcher.cpp | 14 +- utils/TableGen/DAGISelMatcher.h | 8 +- utils/TableGen/DAGISelMatcherEmitter.cpp | 57 +- utils/TableGen/DAGISelMatcherGen.cpp | 97 +- utils/TableGen/DAGISelMatcherOpt.cpp | 11 +- utils/TableGen/FastISelEmitter.cpp | 4 +- utils/TableGen/InstrInfoEmitter.cpp | 2 +- utils/TableGen/IntrinsicEmitter.cpp | 13 +- utils/TableGen/Record.cpp | 30 +- utils/TableGen/Record.h | 12 +- utils/TableGen/TableGen.cpp | 1 - utils/buildit/build_llvm | 2 + utils/lit/lit/LitTestCase.py | 30 + utils/lit/lit/TestFormats.py | 5 +- 
utils/lit/lit/TestRunner.py | 10 +- utils/lit/lit/lit.py | 42 + 390 files changed, 10314 insertions(+), 6109 deletions(-) create mode 100644 lib/Target/X86/SSEDomainFix.cpp create mode 100644 lib/VMCore/DebugLoc.cpp create mode 100644 lib/VMCore/LLVMContextImpl.cpp create mode 100644 test/Bitcode/sse41_pmulld.ll create mode 100644 test/Bitcode/sse41_pmulld.ll.bc create mode 100644 test/CodeGen/CellSPU/bigstack.ll create mode 100644 test/CodeGen/Generic/2010-ZeroSizedArg.ll create mode 100644 test/CodeGen/PowerPC/2010-04-01-MachineCSEBug.ll create mode 100644 test/CodeGen/X86/pmulld.ll rename test/CodeGen/X86/{vec_insert_4.ll => vec_insert-4.ll} (100%) create mode 100644 test/CodeGen/X86/vec_insert-9.ll create mode 100644 test/DebugInfo/2010-03-22-CU-HighLow.ll create mode 100644 test/DebugInfo/2010-03-24-MemberFn.ll create mode 100644 test/DebugInfo/2010-03-30-InvalidDbgInfoCrash.ll create mode 100644 test/FrontendC++/2010-03-22-empty-baseclass.cpp create mode 100644 test/TableGen/2010-03-24-PrematureDefaults.td create mode 100644 test/Transforms/GVN/2010-03-31-RedundantPHIs.ll create mode 100644 test/Transforms/Inline/noinline.ll create mode 100644 test/Transforms/SimplifyCFG/2010-03-30-InvokeCrash.ll create mode 100644 utils/lit/lit/LitTestCase.py diff --git a/Makefile b/Makefile index f5a9b336e374..efbae8ddedaf 100644 --- a/Makefile +++ b/Makefile @@ -214,7 +214,7 @@ update: $(SVN) $(SVN-UPDATE-OPTIONS) update $(LLVM_SRC_ROOT) @ $(SVN) status $(LLVM_SRC_ROOT) | $(SUB-SVN-DIRS) | xargs $(SVN) $(SVN-UPDATE-OPTIONS) update -happiness: update all check unittests +happiness: update all check-all .PHONY: srpm rpm update happiness diff --git a/autoconf/configure.ac b/autoconf/configure.ac index 56d716bb8da7..04c0886ab78d 100644 --- a/autoconf/configure.ac +++ b/autoconf/configure.ac @@ -110,6 +110,11 @@ do llvm-tv) AC_CONFIG_SUBDIRS([projects/llvm-tv]) ;; safecode) AC_CONFIG_SUBDIRS([projects/safecode]) ;; llvm-kernel) AC_CONFIG_SUBDIRS([projects/llvm-kernel]) ;; + 
llvm-gcc) ;; + test-suite) ;; + llvm-test) ;; + poolalloc) ;; + llvm-poolalloc) ;; *) AC_MSG_WARN([Unknown project (${i}) won't be configured automatically]) ;; diff --git a/configure b/configure index a2aad3ea3ced..2836e9ea32b3 100755 --- a/configure +++ b/configure @@ -1999,6 +1999,11 @@ do ;; llvm-kernel) subdirs="$subdirs projects/llvm-kernel" ;; + llvm-gcc) ;; + test-suite) ;; + llvm-test) ;; + poolalloc) ;; + llvm-poolalloc) ;; *) { echo "$as_me:$LINENO: WARNING: Unknown project (${i}) won't be configured automatically" >&5 echo "$as_me: WARNING: Unknown project (${i}) won't be configured automatically" >&2;} @@ -11151,7 +11156,7 @@ else lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2 lt_status=$lt_dlunknown cat > conftest.$ac_ext <The portion of the instruction definition in bold indicates the pattern used to match the instruction. The DAG operators (like fmul/fadd) are defined in - the lib/Target/TargetSelectionDAG.td file. "F4RC" is the - register class of the input and result values.

+ the include/llvm/Target/TargetSelectionDAG.td file. " + F4RC" is the register class of the input and result values.

The TableGen DAG instruction selector generator reads the instruction patterns in the .td file and automatically builds parts of the @@ -2162,7 +2162,7 @@ MOVSX32rm16 -> movsx, 32-bit register, 16-bit memory Chris Lattner
The LLVM Compiler Infrastructure
- Last modified: $Date: 2010-03-11 01:22:57 +0100 (Thu, 11 Mar 2010) $ + Last modified: $Date: 2010-03-25 01:03:04 +0100 (Thu, 25 Mar 2010) $ diff --git a/docs/ProgrammersManual.html b/docs/ProgrammersManual.html index ace44e7d181e..23cfbf60664e 100644 --- a/docs/ProgrammersManual.html +++ b/docs/ProgrammersManual.html @@ -1211,14 +1211,14 @@ and erasing, but does not support iteration.

-

SmallPtrSet has all the advantages of SmallSet (and a SmallSet of pointers is -transparently implemented with a SmallPtrSet), but also supports iterators. If +

SmallPtrSet has all the advantages of SmallSet (and a SmallSet of pointers is +transparently implemented with a SmallPtrSet), but also supports iterators. If more than 'N' insertions are performed, a single quadratically probed hash table is allocated and grows as needed, providing extremely efficient access (constant time insertion/deleting/queries with low constant factors) and is very stingy with malloc traffic.

-

Note that, unlike std::set, the iterators of SmallPtrSet are invalidated +

Note that, unlike std::set, the iterators of SmallPtrSet are invalidated whenever an insertion occurs. Also, the values visited by the iterators are not visited in sorted order.

@@ -1843,6 +1843,21 @@ void printNextInstruction(Instruction* inst) {
+

Unfortunately, these implicit conversions come at a cost; they prevent +these iterators from conforming to standard iterator conventions, and thus +from being usable with standard algorithms and containers. For example, they +prevent the following code, where B is a BasicBlock, +from compiling:

+ +
+
+  llvm::SmallVector<llvm::Instruction *, 16>(B->begin(), B->end());
+
+
+ +

Because of this, these implicit conversions may be removed some day, +and operator* changed to return a pointer instead of a reference.

+ @@ -1962,7 +1977,11 @@ for (Value::use_iterator i = F->use_begin(), e = F->use_end(); i != e; ++i -

Alternately, it's common to have an instance of the Note that dereferencing a Value::use_iterator is not a very cheap +operation. Instead of performing *i above several times, consider +doing it only once in the loop body and reusing its result.

+ +

Alternatively, it's common to have an instance of the User Class and need to know what Values are used by it. The list of all Values used by a User is known as a use-def chain. Instances of class @@ -1981,10 +2000,13 @@ for (User::op_iterator i = pi->op_begin(), e = pi->op_end(); i != e; ++i) - +

Declaring objects as const is an important tool of enforcing +mutation free algorithms (such as analyses, etc.). For this purpose above +iterators come in constant flavors as Value::const_use_iterator +and Value::const_op_iterator. They automatically arise when +calling use/op_begin() on const Value*s or +const User*s respectively. Upon dereferencing, they return +const Use*s. Otherwise the above patterns remain unchanged.

@@ -3058,7 +3080,7 @@ the lib/VMCore directory.

FunctionType
Subclass of DerivedTypes for function types.
    -
  • bool isVarArg() const: Returns true if its a vararg +
  • bool isVarArg() const: Returns true if it's a vararg function
  • const Type * getReturnType() const: Returns the return type of the function.
  • @@ -3276,7 +3298,7 @@ simplifies the representation and makes it easier to manipulate.

    • Value::use_iterator - Typedef for iterator over the use-list
      - Value::use_const_iterator - Typedef for const_iterator over + Value::const_use_iterator - Typedef for const_iterator over the use-list
      unsigned use_size() - Returns the number of users of the value.
      @@ -3921,7 +3943,7 @@ arguments. An argument has a pointer to the parent Function.

      Dinakar Dhurjati and Chris Lattner
      The LLVM Compiler Infrastructure
      - Last modified: $Date: 2010-02-26 00:51:27 +0100 (Fri, 26 Feb 2010) $ + Last modified: $Date: 2010-04-02 02:08:26 +0200 (Fri, 02 Apr 2010) $ diff --git a/docs/ReleaseNotes.html b/docs/ReleaseNotes.html index 53a018aefbc7..8470e8356bce 100644 --- a/docs/ReleaseNotes.html +++ b/docs/ReleaseNotes.html @@ -127,8 +127,21 @@ development. Here we include updates on these subprojects.

      In the LLVM 2.7 time-frame, the Clang team has made many improvements:

        -
      • ...
      • -include a link to cxx_compatibility.html +
      • FIXME: C++! Include a link to cxx_compatibility.html
      • + +
      • FIXME: Static Analyzer improvements?
      • + +
      • CIndex API and Python bindings: Clang now includes a C API as part of the +CIndex library. Although we make make some changes to the API in the future, it +is intended to be stable and has been designed for use by external projects. See +the Clang +doxygen CIndex +documentation for more details. The CIndex API also includings an preliminary +set of Python bindings.
      • + +
      • ARM Support: Clang now has ABI support for both the Darwin and Linux ARM +ABIs. Coupled with many improvements to the LLVM ARM backend, Clang is now +suitable for use as a a beta quality ARM compiler.
      @@ -162,13 +175,23 @@ implementation of the CLI) using LLVM for static and just-in-time compilation.

      -VMKit version ?? builds with LLVM 2.7 and you can find it on its -web page. The release includes -bug fixes, cleanup and new features. The major changes are:

      +With the release of LLVM 2.7, VMKit has shifted to a great framework for writing +virtual machines. VMKit now offers precise and efficient garbage collection with +multi-threading support, thanks to the MMTk memory management toolkit, as well +as just in time and ahead of time compilation with LLVM. The major changes in +VMKit 0.27 are:

        -
      • ...
      • +
      • Garbage collection: VMKit now uses the MMTk toolkit for garbage collectors. + The first collector to be ported is the MarkSweep collector, which is precise, + and drastically improves the performance of VMKit.
      • +
      • Line number information in the JVM: by using the debug metadata of LLVM, the + JVM now supports precise line number information, useful when printing a stack + trace.
      • +
      • Interface calls in the JVM: we implemented a variant of the Interface Method + Table technique for interface calls in the JVM. +
      @@ -391,6 +414,27 @@ code.--> + + + +
      +

      +TCE is a toolset for designing +application-specific processors (ASP) based on the Transport triggered +architecture (TTA). The toolset provides a complete co-design flow from C/C++ +programs down to synthesizable VHDL and parallel program binaries. Processor +customization points include the register files, function units, supported +operations, and the interconnection network.

      + +

      TCE uses llvm-gcc/Clang and LLVM for C/C++ language support, target +independent optimizations and also for parts of code generation. It generates +new LLVM-based code generators "on the fly" for the designed TTA processors and +loads them in to the compiler backend as runtime libraries to avoid per-target +recompilation of larger parts of the compiler chain.

      + +
      @@ -439,7 +483,7 @@ New llvm/Support/Regex.h API. FileCheck now does regex's Many subtle pointer invalidation bugs in Callgraph have been fixed and it now uses asserting value handles. MC Disassembler (with blog post), MCInstPrinter. Many X86 backend and AsmPrinter simplifications Various tools like llc and opt now read either .ll or .bc files as input. -Malloc and free instructions got removed. +Malloc and free instructions got removed, along with LowerAllocations pass. compiler-rt support for ARM. completely llvm-gcc NEON support. Can transcode from GAS to intel syntax with "llvm-mc foo.s -output-asm-variant=1" @@ -456,8 +500,12 @@ x86 sibcall optimization New LSR with full strength reduction mode The most awesome sext / zext optimization pass. ? +The ARM backend now has good support for ARMv4 backend (tested on StrongARM + hardware), previously only supported ARMv4T and newer. + +Defaults to RTTI off, packagers should build with make REQUIRE_RTTI=1. CondProp pass removed (functionality merged into jump threading). AndersAA got removed (from 2.7 or mainline?) PredSimplify, LoopVR, GVNPRE got removed. @@ -978,20 +1026,6 @@ ignored.
    - - - -
    - -

    The Llvm.Linkage module is broken, and has incorrect values. Only -Llvm.Linkage.External, Llvm.Linkage.Available_externally, and -Llvm.Linkage.Link_once will be correct. If you need any of the other linkage -modes, you'll have to write an external C library in order to expose the -functionality. This has been fixed in the trunk.

    -
    -
    Additional Information @@ -1024,7 +1058,7 @@ lists.

    src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"> LLVM Compiler Infrastructure
    - Last modified: $Date: 2010-03-19 04:18:05 +0100 (Fri, 19 Mar 2010) $ + Last modified: $Date: 2010-04-01 03:53:24 +0200 (Thu, 01 Apr 2010) $ diff --git a/docs/SourceLevelDebugging.html b/docs/SourceLevelDebugging.html index 7fffece2e0e4..021d7ada158c 100644 --- a/docs/SourceLevelDebugging.html +++ b/docs/SourceLevelDebugging.html @@ -289,26 +289,25 @@ height="369"> 0x1000.)

    The fields of debug descriptors used internally by LLVM - are restricted to only the simple data types int, uint, - bool, float, double, mdstring and - mdnode.

    + are restricted to only the simple data types i32, i1, + float, double, mdstring and mdnode.

     !1 = metadata !{
    -  uint,   ;; A tag
    +  i32,   ;; A tag
       ...
     }
     

    The first field of a descriptor is always an - uint containing a tag value identifying the content of the + i32 containing a tag value identifying the content of the descriptor. The remaining fields are specific to the descriptor. The values of tags are loosely bound to the tag values of DWARF information entries. However, that does not restrict the use of the information supplied to DWARF targets. To facilitate versioning of debug information, the tag is augmented - with the current debug version (LLVMDebugVersion = 8 << 16 or 0x80000 or + with the current debug version (LLVMDebugVersion = 8 << 16 or 0x80000 or 524288.)

    The details of the various descriptors follow.

    @@ -829,8 +828,8 @@ DW_TAG_return_variable = 258 rules.

    In order to handle this, the LLVM debug format uses the metadata attached to - llvm instructions to encode line nuber and scoping information. Consider the - following C fragment, for example:

    + llvm instructions to encode line number and scoping information. Consider + the following C fragment, for example:

    @@ -1069,6 +1068,18 @@ int main(int argc, char *argv[]) {
     
    +

    llvm::Instruction provides easy access to metadata attached with an +instruction. One can extract line number information encoded in LLVM IR +using Instruction::getMetadata() and +DILocation::getLineNumber(). +

    + if (MDNode *N = I->getMetadata("dbg")) {  // Here I is an LLVM instruction
    +   DILocation Loc(N);                      // DILocation is in DebugInfo.h
    +   unsigned Line = Loc.getLineNumber();
    +   StringRef File = Loc.getFilename();
    +   StringRef Dir = Loc.getDirectory();
    + }
    +
    @@ -1762,7 +1773,7 @@ enum Trees { Chris Lattner
    LLVM Compiler Infrastructure
    - Last modified: $Date: 2010-03-17 16:01:50 +0100 (Wed, 17 Mar 2010) $ + Last modified: $Date: 2010-03-31 09:50:17 +0200 (Wed, 31 Mar 2010) $ diff --git a/docs/TableGenFundamentals.html b/docs/TableGenFundamentals.html index 701d11fde62b..54e8c26878a3 100644 --- a/docs/TableGenFundamentals.html +++ b/docs/TableGenFundamentals.html @@ -768,9 +768,6 @@ patterns:

    an implicitly defined physical register. This tells the dag instruction selection emitter the input pattern's extra definitions matches implicit physical register definitions.
    -
    (parallel (a), (b))
    -
    a list of dags specifying parallel operations which map to the same - instruction.
    @@ -797,7 +794,7 @@ This should highlight the APIs in TableGen/Record.h.

    Chris Lattner
    LLVM Compiler Infrastructure
    - Last modified: $Date: 2010-02-28 00:47:46 +0100 (Sun, 28 Feb 2010) $ + Last modified: $Date: 2010-03-27 03:53:27 +0100 (Sat, 27 Mar 2010) $ diff --git a/docs/tutorial/OCamlLangImpl6.html b/docs/tutorial/OCamlLangImpl6.html index de706981b5dd..c10d5bbb65c1 100644 --- a/docs/tutorial/OCamlLangImpl6.html +++ b/docs/tutorial/OCamlLangImpl6.html @@ -821,7 +821,7 @@ ocaml_lib ~extern:true "llvm_executionengine";; ocaml_lib ~extern:true "llvm_target";; ocaml_lib ~extern:true "llvm_scalar_opts";; -flag ["link"; "ocaml"; "g++"] (S[A"-cc"; A"g++"]);; +flag ["link"; "ocaml"; "g++"] (S[A"-cc"; A"g++"; A"-cclib"; A"-rdynamic"]);; dep ["link"; "ocaml"; "use_bindings"] ["bindings.o"];;
@@ -1568,7 +1568,7 @@ SSA construction Chris Lattner
Erick Tryzelaar
The LLVM Compiler Infrastructure
- Last modified: $Date: 2010-03-08 20:32:18 +0100 (Mon, 08 Mar 2010) $ + Last modified: $Date: 2010-03-22 00:15:13 +0100 (Mon, 22 Mar 2010) $ diff --git a/docs/tutorial/OCamlLangImpl7.html b/docs/tutorial/OCamlLangImpl7.html index d4b7c9b709fb..c360f2acda3e 100644 --- a/docs/tutorial/OCamlLangImpl7.html +++ b/docs/tutorial/OCamlLangImpl7.html @@ -999,7 +999,7 @@ ocaml_lib ~extern:true "llvm_executionengine";; ocaml_lib ~extern:true "llvm_target";; ocaml_lib ~extern:true "llvm_scalar_opts";; -flag ["link"; "ocaml"; "g++"] (S[A"-cc"; A"g++"]);; +flag ["link"; "ocaml"; "g++"] (S[A"-cc"; A"g++"; A"-cclib"; A"-rdynamic"]);; dep ["link"; "ocaml"; "use_bindings"] ["bindings.o"];; @@ -1901,7 +1901,7 @@ extern double printd(double X) { Chris Lattner
The LLVM Compiler Infrastructure
Erick Tryzelaar
- Last modified: $Date: 2010-03-08 20:32:18 +0100 (Mon, 08 Mar 2010) $ + Last modified: $Date: 2010-03-22 00:15:13 +0100 (Mon, 22 Mar 2010) $ diff --git a/examples/OCaml-Kaleidoscope/Chapter6/myocamlbuild.ml b/examples/OCaml-Kaleidoscope/Chapter6/myocamlbuild.ml index ff42664c43ba..54d3fd977095 100644 --- a/examples/OCaml-Kaleidoscope/Chapter6/myocamlbuild.ml +++ b/examples/OCaml-Kaleidoscope/Chapter6/myocamlbuild.ml @@ -6,5 +6,5 @@ ocaml_lib ~extern:true "llvm_executionengine";; ocaml_lib ~extern:true "llvm_target";; ocaml_lib ~extern:true "llvm_scalar_opts";; -flag ["link"; "ocaml"; "g++"] (S[A"-cc"; A"g++"]);; +flag ["link"; "ocaml"; "g++"] (S[A"-cc"; A"g++"; A"-cclib"; A"-rdynamic"]);; dep ["link"; "ocaml"; "use_bindings"] ["bindings.o"];; diff --git a/examples/OCaml-Kaleidoscope/Chapter7/myocamlbuild.ml b/examples/OCaml-Kaleidoscope/Chapter7/myocamlbuild.ml index ff42664c43ba..54d3fd977095 100644 --- a/examples/OCaml-Kaleidoscope/Chapter7/myocamlbuild.ml +++ b/examples/OCaml-Kaleidoscope/Chapter7/myocamlbuild.ml @@ -6,5 +6,5 @@ ocaml_lib ~extern:true "llvm_executionengine";; ocaml_lib ~extern:true "llvm_target";; ocaml_lib ~extern:true "llvm_scalar_opts";; -flag ["link"; "ocaml"; "g++"] (S[A"-cc"; A"g++"]);; +flag ["link"; "ocaml"; "g++"] (S[A"-cc"; A"g++"; A"-cclib"; A"-rdynamic"]);; dep ["link"; "ocaml"; "use_bindings"] ["bindings.o"];; diff --git a/include/llvm/ADT/PointerUnion.h b/include/llvm/ADT/PointerUnion.h index 49c894092fa4..3a514b562697 100644 --- a/include/llvm/ADT/PointerUnion.h +++ b/include/llvm/ADT/PointerUnion.h @@ -124,7 +124,7 @@ namespace llvm { } void *getOpaqueValue() const { return Val.getOpaqueValue(); } - static PointerUnion getFromOpaqueValue(void *VP) { + static inline PointerUnion getFromOpaqueValue(void *VP) { PointerUnion V; V.Val = ValTy::getFromOpaqueValue(VP); return V; @@ -227,7 +227,7 @@ namespace llvm { } void *getOpaqueValue() const { return Val.getOpaqueValue(); } - static PointerUnion3 getFromOpaqueValue(void *VP) { + 
static inline PointerUnion3 getFromOpaqueValue(void *VP) { PointerUnion3 V; V.Val = ValTy::getFromOpaqueValue(VP); return V; @@ -338,7 +338,7 @@ namespace llvm { } void *getOpaqueValue() const { return Val.getOpaqueValue(); } - static PointerUnion4 getFromOpaqueValue(void *VP) { + static inline PointerUnion4 getFromOpaqueValue(void *VP) { PointerUnion4 V; V.Val = ValTy::getFromOpaqueValue(VP); return V; diff --git a/include/llvm/ADT/SmallVector.h b/include/llvm/ADT/SmallVector.h index c2afb7e681d1..2d79a029d019 100644 --- a/include/llvm/ADT/SmallVector.h +++ b/include/llvm/ADT/SmallVector.h @@ -239,11 +239,20 @@ class SmallVectorTemplateBase : public SmallVectorTemplateCommon { /// starting with "Dest", constructing elements into it as needed. template static void uninitialized_copy(It1 I, It1 E, It2 Dest) { - // Use memcpy for PODs: std::uninitialized_copy optimizes to memmove, memcpy - // is better. - memcpy(&*Dest, &*I, (E-I)*sizeof(T)); + // Arbitrary iterator types; just use the basic implementation. + std::uninitialized_copy(I, E, Dest); } - + + /// uninitialized_copy - Copy the range [I, E) onto the uninitialized memory + /// starting with "Dest", constructing elements into it as needed. + template + static void uninitialized_copy(T1 *I, T1 *E, T2 *Dest) { + // Use memcpy for PODs iterated by pointers (which includes SmallVector + // iterators): std::uninitialized_copy optimizes to memmove, but we can + // use memcpy here. + memcpy(Dest, I, (E-I)*sizeof(T)); + } + /// grow - double the size of the allocated memory, guaranteeing space for at /// least one more element or MinSize if specified. void grow(size_t MinSize = 0) { @@ -501,10 +510,13 @@ class SmallVectorImpl : public SmallVectorTemplateBase::value> { this->uninitialized_copy(I, OldEnd, this->end()-NumOverwritten); // Replace the overwritten part. 
- std::copy(From, From+NumOverwritten, I); + for (; NumOverwritten > 0; --NumOverwritten) { + *I = *From; + ++I; ++From; + } // Insert the non-overwritten middle part. - this->uninitialized_copy(From+NumOverwritten, To, OldEnd); + this->uninitialized_copy(From, To, OldEnd); return I; } diff --git a/include/llvm/ADT/Statistic.h b/include/llvm/ADT/Statistic.h index 1a4833cc4a07..c593c58c1274 100644 --- a/include/llvm/ADT/Statistic.h +++ b/include/llvm/ADT/Statistic.h @@ -29,6 +29,7 @@ #include "llvm/System/Atomic.h" namespace llvm { +class raw_ostream; class Statistic { public: @@ -113,6 +114,15 @@ class Statistic { #define STATISTIC(VARNAME, DESC) \ static llvm::Statistic VARNAME = { DEBUG_TYPE, DESC, 0, 0 } +/// \brief Enable the collection and printing of statistics. +void EnableStatistics(); + +/// \brief Print statistics to the file returned by CreateInfoOutputFile(). +void PrintStatistics(); + +/// \brief Print statistics to the given output stream. +void PrintStatistics(raw_ostream &OS); + } // End llvm namespace #endif diff --git a/include/llvm/ADT/StringMap.h b/include/llvm/ADT/StringMap.h index 86e8546adcac..482193859b64 100644 --- a/include/llvm/ADT/StringMap.h +++ b/include/llvm/ADT/StringMap.h @@ -216,6 +216,14 @@ class StringMapEntry : public StringMapEntryBase { static const StringMapEntry &GetStringMapEntryFromValue(const ValueTy &V) { return GetStringMapEntryFromValue(const_cast(V)); } + + /// GetStringMapEntryFromKeyData - Given key data that is known to be embedded + /// into a StringMapEntry, return the StringMapEntry itself. + static StringMapEntry &GetStringMapEntryFromKeyData(const char *KeyData) { + char *Ptr = const_cast(KeyData) - sizeof(StringMapEntry); + return *reinterpret_cast(Ptr); + } + /// Destroy - Destroy this StringMapEntry, releasing memory back to the /// specified allocator. 
diff --git a/include/llvm/Analysis/DebugInfo.h b/include/llvm/Analysis/DebugInfo.h index 9b1d1b306f2b..4e8c4c85bfe4 100644 --- a/include/llvm/Analysis/DebugInfo.h +++ b/include/llvm/Analysis/DebugInfo.h @@ -395,8 +395,21 @@ namespace llvm { } unsigned isArtificial() const { return getUnsignedField(14); } - StringRef getFilename() const { return getCompileUnit().getFilename();} - StringRef getDirectory() const { return getCompileUnit().getDirectory();} + StringRef getFilename() const { + if (getVersion() == llvm::LLVMDebugVersion7) + return getCompileUnit().getFilename(); + + DIFile F = getFieldAs(6); + return F.getFilename(); + } + + StringRef getDirectory() const { + if (getVersion() == llvm::LLVMDebugVersion7) + return getCompileUnit().getFilename(); + + DIFile F = getFieldAs(6); + return F.getDirectory(); + } /// Verify - Verify that a subprogram descriptor is well formed. bool Verify() const; diff --git a/include/llvm/Analysis/Dominators.h b/include/llvm/Analysis/Dominators.h index 1e94f304c922..f8103103a0e3 100644 --- a/include/llvm/Analysis/Dominators.h +++ b/include/llvm/Analysis/Dominators.h @@ -116,12 +116,12 @@ class DomTreeNodeBase { return true; SmallPtrSet OtherChildren; - for(iterator I = Other->begin(), E = Other->end(); I != E; ++I) { + for (iterator I = Other->begin(), E = Other->end(); I != E; ++I) { NodeT *Nd = (*I)->getBlock(); OtherChildren.insert(Nd); } - for(iterator I = begin(), E = end(); I != E; ++I) { + for (iterator I = begin(), E = end(); I != E; ++I) { NodeT *N = (*I)->getBlock(); if (OtherChildren.count(N) == 0) return true; @@ -240,8 +240,9 @@ class DominatorTreeBase : public DominatorBase { template void Split(DominatorTreeBase& DT, typename GraphT::NodeType* NewBB) { - assert(std::distance(GraphT::child_begin(NewBB), GraphT::child_end(NewBB)) == 1 - && "NewBB should have a single successor!"); + assert(std::distance(GraphT::child_begin(NewBB), + GraphT::child_end(NewBB)) == 1 && + "NewBB should have a single successor!"); typename 
GraphT::NodeType* NewBBSucc = *GraphT::child_begin(NewBB); std::vector PredBlocks; @@ -374,8 +375,8 @@ class DominatorTreeBase : public DominatorBase { /// isReachableFromEntry - Return true if A is dominated by the entry /// block of the function containing it. bool isReachableFromEntry(NodeT* A) { - assert (!this->isPostDominator() - && "This is not implemented for post dominators"); + assert(!this->isPostDominator() && + "This is not implemented for post dominators"); return dominates(&A->getParent()->front(), A); } @@ -393,8 +394,9 @@ class DominatorTreeBase : public DominatorBase { // Compare the result of the tree walk and the dfs numbers, if expensive // checks are enabled. #ifdef XDEBUG - assert(!DFSInfoValid - || (dominatedBySlowTreeWalk(A, B) == B->DominatedBy(A))); + assert((!DFSInfoValid || + (dominatedBySlowTreeWalk(A, B) == B->DominatedBy(A))) && + "Tree walk disagrees with dfs numbers!"); #endif if (DFSInfoValid) @@ -430,16 +432,16 @@ class DominatorTreeBase : public DominatorBase { /// findNearestCommonDominator - Find nearest common dominator basic block /// for basic block A and B. If there is no such block then return NULL. NodeT *findNearestCommonDominator(NodeT *A, NodeT *B) { + assert(A->getParent() == B->getParent() && + "Two blocks are not in same function"); - assert (!this->isPostDominator() - && "This is not implemented for post dominators"); - assert (A->getParent() == B->getParent() - && "Two blocks are not in same function"); - - // If either A or B is a entry block then it is nearest common dominator. - NodeT &Entry = A->getParent()->front(); - if (A == &Entry || B == &Entry) - return &Entry; + // If either A or B is a entry block then it is nearest common dominator + // (for forward-dominators). + if (!this->isPostDominator()) { + NodeT &Entry = A->getParent()->front(); + if (A == &Entry || B == &Entry) + return &Entry; + } // If B dominates A then B is nearest common dominator. 
if (dominates(B, A)) @@ -463,7 +465,7 @@ class DominatorTreeBase : public DominatorBase { // Walk NodeB immediate dominators chain and find common dominator node. DomTreeNodeBase *IDomB = NodeB->getIDom(); - while(IDomB) { + while (IDomB) { if (NodeADoms.count(IDomB) != 0) return IDomB->getBlock(); @@ -508,8 +510,8 @@ class DominatorTreeBase : public DominatorBase { /// children list. Deletes dominator node associated with basic block BB. void eraseNode(NodeT *BB) { DomTreeNodeBase *Node = getNode(BB); - assert (Node && "Removing node that isn't in dominator tree."); - assert (Node->getChildren().empty() && "Node is not a leaf node."); + assert(Node && "Removing node that isn't in dominator tree."); + assert(Node->getChildren().empty() && "Node is not a leaf node."); // Remove node from immediate dominator's children list. DomTreeNodeBase *IDom = Node->getIDom(); @@ -952,7 +954,7 @@ class DominanceFrontierBase : public FunctionPass { return true; } - if(!tmpSet.empty()) + if (!tmpSet.empty()) // There are nodes that are in DS2 but not in DS1. 
return true; diff --git a/include/llvm/CodeGen/AsmPrinter.h b/include/llvm/CodeGen/AsmPrinter.h index 2cd477e8065c..ffeb44da7bc8 100644 --- a/include/llvm/CodeGen/AsmPrinter.h +++ b/include/llvm/CodeGen/AsmPrinter.h @@ -47,7 +47,6 @@ namespace llvm { class MCSection; class MCStreamer; class MCSymbol; - class MDNode; class DwarfWriter; class Mangler; class MCAsmInfo; @@ -138,9 +137,6 @@ namespace llvm { mutable unsigned Counter; mutable unsigned SetCounter; - // Private state for processDebugLoc() - mutable const MDNode *PrevDLT; - protected: explicit AsmPrinter(formatted_raw_ostream &o, TargetMachine &TM, MCStreamer &Streamer); diff --git a/include/llvm/CodeGen/DwarfWriter.h b/include/llvm/CodeGen/DwarfWriter.h index 3c7f8024d9cd..494400ee2c98 100644 --- a/include/llvm/CodeGen/DwarfWriter.h +++ b/include/llvm/CodeGen/DwarfWriter.h @@ -83,19 +83,11 @@ class DwarfWriter : public ImmutablePass { /// void EndFunction(const MachineFunction *MF); - /// RecordSourceLine - Register a source line with debug info. Returns the - /// unique label that was emitted and which provides correspondence to - /// the source line list. - MCSymbol *RecordSourceLine(unsigned Line, unsigned Col, MDNode *Scope); - - /// getRecordSourceLineCount - Count source lines. - unsigned getRecordSourceLineCount(); - /// ShouldEmitDwarfDebug - Returns true if Dwarf debugging declarations should /// be emitted. bool ShouldEmitDwarfDebug() const; - void BeginScope(const MachineInstr *MI, MCSymbol *Label); + void BeginScope(const MachineInstr *MI); void EndScope(const MachineInstr *MI); }; diff --git a/include/llvm/CodeGen/LiveInterval.h b/include/llvm/CodeGen/LiveInterval.h index eb5901cf2f17..70e79cebcb1b 100644 --- a/include/llvm/CodeGen/LiveInterval.h +++ b/include/llvm/CodeGen/LiveInterval.h @@ -67,7 +67,7 @@ namespace llvm { } cr; public: - + typedef SpecificBumpPtrAllocator Allocator; typedef SmallVector KillSet; /// The ID number of this value. 
@@ -330,12 +330,7 @@ namespace llvm { } void clear() { - while (!valnos.empty()) { - VNInfo *VNI = valnos.back(); - valnos.pop_back(); - VNI->~VNInfo(); - } - + valnos.clear(); ranges.clear(); } @@ -370,10 +365,8 @@ namespace llvm { /// getNextValue - Create a new value number and return it. MIIdx specifies /// the instruction that defines the value number. VNInfo *getNextValue(SlotIndex def, MachineInstr *CopyMI, - bool isDefAccurate, BumpPtrAllocator &VNInfoAllocator){ - VNInfo *VNI = - static_cast(VNInfoAllocator.Allocate((unsigned)sizeof(VNInfo), - alignof())); + bool isDefAccurate, VNInfo::Allocator &VNInfoAllocator) { + VNInfo *VNI = VNInfoAllocator.Allocate(); new (VNI) VNInfo((unsigned)valnos.size(), def, CopyMI); VNI->setIsDefAccurate(isDefAccurate); valnos.push_back(VNI); @@ -383,11 +376,8 @@ namespace llvm { /// Create a copy of the given value. The new value will be identical except /// for the Value number. VNInfo *createValueCopy(const VNInfo *orig, - BumpPtrAllocator &VNInfoAllocator) { - VNInfo *VNI = - static_cast(VNInfoAllocator.Allocate((unsigned)sizeof(VNInfo), - alignof())); - + VNInfo::Allocator &VNInfoAllocator) { + VNInfo *VNI = VNInfoAllocator.Allocate(); new (VNI) VNInfo((unsigned)valnos.size(), *orig); valnos.push_back(VNI); return VNI; @@ -427,14 +417,14 @@ namespace llvm { /// VNInfoAllocator since it will create a new val#. void MergeInClobberRanges(LiveIntervals &li_, const LiveInterval &Clobbers, - BumpPtrAllocator &VNInfoAllocator); + VNInfo::Allocator &VNInfoAllocator); /// MergeInClobberRange - Same as MergeInClobberRanges except it merge in a /// single LiveRange only. void MergeInClobberRange(LiveIntervals &li_, SlotIndex Start, SlotIndex End, - BumpPtrAllocator &VNInfoAllocator); + VNInfo::Allocator &VNInfoAllocator); /// MergeValueInAsValue - Merge all of the live ranges of a specific val# /// in RHS into this live interval as the specified value number. 
@@ -454,7 +444,7 @@ namespace llvm { /// Copy - Copy the specified live interval. This copies all the fields /// except for the register of the interval. void Copy(const LiveInterval &RHS, MachineRegisterInfo *MRI, - BumpPtrAllocator &VNInfoAllocator); + VNInfo::Allocator &VNInfoAllocator); bool empty() const { return ranges.empty(); } diff --git a/include/llvm/CodeGen/LiveIntervalAnalysis.h b/include/llvm/CodeGen/LiveIntervalAnalysis.h index 1a2cc25ba490..8ddcac705990 100644 --- a/include/llvm/CodeGen/LiveIntervalAnalysis.h +++ b/include/llvm/CodeGen/LiveIntervalAnalysis.h @@ -55,7 +55,7 @@ namespace llvm { /// Special pool allocator for VNInfo's (LiveInterval val#). /// - BumpPtrAllocator VNInfoAllocator; + VNInfo::Allocator VNInfoAllocator; typedef DenseMap Reg2IntervalMap; Reg2IntervalMap r2iMap_; @@ -221,7 +221,7 @@ namespace llvm { indexes_->renumberIndexes(); } - BumpPtrAllocator& getVNInfoAllocator() { return VNInfoAllocator; } + VNInfo::Allocator& getVNInfoAllocator() { return VNInfoAllocator; } /// getVNInfoSourceReg - Helper function that parses the specified VNInfo /// copy field and returns the source register that defines it. diff --git a/include/llvm/CodeGen/LiveStackAnalysis.h b/include/llvm/CodeGen/LiveStackAnalysis.h index e01d1aea7e83..c6af6a1f89ce 100644 --- a/include/llvm/CodeGen/LiveStackAnalysis.h +++ b/include/llvm/CodeGen/LiveStackAnalysis.h @@ -27,7 +27,7 @@ namespace llvm { class LiveStacks : public MachineFunctionPass { /// Special pool allocator for VNInfo's (LiveInterval val#). /// - BumpPtrAllocator VNInfoAllocator; + VNInfo::Allocator VNInfoAllocator; /// S2IMap - Stack slot indices to live interval mapping. 
/// @@ -91,7 +91,7 @@ namespace llvm { return I->second; } - BumpPtrAllocator& getVNInfoAllocator() { return VNInfoAllocator; } + VNInfo::Allocator& getVNInfoAllocator() { return VNInfoAllocator; } virtual void getAnalysisUsage(AnalysisUsage &AU) const; virtual void releaseMemory(); diff --git a/include/llvm/CodeGen/MachineOperand.h b/include/llvm/CodeGen/MachineOperand.h index e5229479f1e2..b5f6bcd9e7ec 100644 --- a/include/llvm/CodeGen/MachineOperand.h +++ b/include/llvm/CodeGen/MachineOperand.h @@ -285,6 +285,11 @@ class MachineOperand { IsEarlyClobber = Val; } + void setIsDebug(bool Val = true) { + assert(isReg() && IsDef && "Wrong MachineOperand accessor"); + IsDebug = Val; + } + //===--------------------------------------------------------------------===// // Accessors for various operand types. //===--------------------------------------------------------------------===// diff --git a/include/llvm/CodeGen/RuntimeLibcalls.h b/include/llvm/CodeGen/RuntimeLibcalls.h index 4ac316084c52..42ae5635f527 100644 --- a/include/llvm/CodeGen/RuntimeLibcalls.h +++ b/include/llvm/CodeGen/RuntimeLibcalls.h @@ -169,6 +169,8 @@ namespace RTLIB { FPTOSINT_F32_I32, FPTOSINT_F32_I64, FPTOSINT_F32_I128, + FPTOSINT_F64_I8, + FPTOSINT_F64_I16, FPTOSINT_F64_I32, FPTOSINT_F64_I64, FPTOSINT_F64_I128, @@ -183,6 +185,8 @@ namespace RTLIB { FPTOUINT_F32_I32, FPTOUINT_F32_I64, FPTOUINT_F32_I128, + FPTOUINT_F64_I8, + FPTOUINT_F64_I16, FPTOUINT_F64_I32, FPTOUINT_F64_I64, FPTOUINT_F64_I128, diff --git a/include/llvm/CodeGen/SelectionDAG.h b/include/llvm/CodeGen/SelectionDAG.h index c8d29aa0c7d1..610edb6fcfb1 100644 --- a/include/llvm/CodeGen/SelectionDAG.h +++ b/include/llvm/CodeGen/SelectionDAG.h @@ -34,6 +34,7 @@ class FunctionLoweringInfo; class MachineConstantPoolValue; class MachineFunction; class MachineModuleInfo; +class MDNode; class SDNodeOrdering; class SDDbgValue; class TargetLowering; @@ -60,42 +61,40 @@ template<> struct ilist_traits : public ilist_default_traits { /// SDDbgInfo 
- Keeps track of dbg_value information through SDISel. We do /// not build SDNodes for these so as not to perturb the generated code; -/// instead the info is kept off to the side in this structure. SDNodes may -/// have an associated dbg_value entry in DbgValMap. Debug info that is not -/// associated with any SDNode is held in DbgConstMap. It is possible for -/// optimizations to change a variable to a constant, in which case the -/// corresponding debug info is moved from the variable to the constant table -/// (NYI). +/// instead the info is kept off to the side in this structure. Each SDNode may +/// have one or more associated dbg_value entries. This information is kept in +/// DbgValMap. class SDDbgInfo { - DenseMap DbgVblMap; - SmallVector DbgConstMap; + SmallVector DbgValues; + DenseMap > DbgValMap; void operator=(const SDDbgInfo&); // Do not implement. SDDbgInfo(const SDDbgInfo&); // Do not implement. public: SDDbgInfo() {} - void add(const SDNode *Node, SDDbgValue *V) { - DbgVblMap[Node] = V; + void add(SDDbgValue *V, const SDNode *Node = 0) { + if (Node) + DbgValMap[Node].push_back(V); + DbgValues.push_back(V); } - void add(SDDbgValue *V) { DbgConstMap.push_back(V); } - void remove(const SDNode *Node) { - DenseMap::iterator Itr = - DbgVblMap.find(Node); - if (Itr != DbgVblMap.end()) - DbgVblMap.erase(Itr); - } - // No need to remove a constant. 
+ void clear() { - DbgVblMap.clear(); - DbgConstMap.clear(); + DbgValMap.clear(); + DbgValues.clear(); } - SDDbgValue *getSDDbgValue(const SDNode *Node) { - return DbgVblMap[Node]; + + bool empty() const { + return DbgValues.empty(); } - typedef SmallVector::iterator ConstDbgIterator; - ConstDbgIterator DbgConstBegin() { return DbgConstMap.begin(); } - ConstDbgIterator DbgConstEnd() { return DbgConstMap.end(); } + + SmallVector &getSDDbgValues(const SDNode *Node) { + return DbgValMap[Node]; + } + + typedef SmallVector::iterator DbgIterator; + DbgIterator DbgBegin() { return DbgValues.begin(); } + DbgIterator DbgEnd() { return DbgValues.end(); } }; enum CombineLevel { @@ -769,6 +768,15 @@ class SelectionDAG { SDNode *getNodeIfExists(unsigned Opcode, SDVTList VTs, const SDValue *Ops, unsigned NumOps); + /// getDbgValue - Creates a SDDbgValue node. + /// + SDDbgValue *getDbgValue(MDNode *MDPtr, SDNode *N, unsigned R, uint64_t Off, + DebugLoc DL, unsigned O); + SDDbgValue *getDbgValue(MDNode *MDPtr, Value *C, uint64_t Off, + DebugLoc DL, unsigned O); + SDDbgValue *getDbgValue(MDNode *MDPtr, unsigned FI, uint64_t Off, + DebugLoc DL, unsigned O); + /// DAGUpdateListener - Clients of various APIs that cause global effects on /// the DAG can optionally implement this interface. This allows the clients /// to handle the various sorts of updates that happen. @@ -871,19 +879,21 @@ class SelectionDAG { /// GetOrdering - Get the order for the SDNode. unsigned GetOrdering(const SDNode *SD) const; - /// AssignDbgInfo - Assign debug info to the SDNode. - void AssignDbgInfo(SDNode *SD, SDDbgValue *db); + /// AddDbgValue - Add a dbg_value SDNode. If SD is non-null that means the + /// value is produced by SD. + void AddDbgValue(SDDbgValue *DB, SDNode *SD = 0); - /// RememberDbgInfo - Remember debug info with no associated SDNode. - void RememberDbgInfo(SDDbgValue *db); - - /// GetDbgInfo - Get the debug info for the SDNode. 
- SDDbgValue *GetDbgInfo(const SDNode* SD); - - SDDbgInfo::ConstDbgIterator DbgConstBegin() { - return DbgInfo->DbgConstBegin(); + /// GetDbgValues - Get the debug values which reference the given SDNode. + SmallVector &GetDbgValues(const SDNode* SD) { + return DbgInfo->getSDDbgValues(SD); } - SDDbgInfo::ConstDbgIterator DbgConstEnd() { return DbgInfo->DbgConstEnd(); } + + /// hasDebugValues - Return true if there are any SDDbgValue nodes associated + /// with this SelectionDAG. + bool hasDebugValues() const { return !DbgInfo->empty(); } + + SDDbgInfo::DbgIterator DbgBegin() { return DbgInfo->DbgBegin(); } + SDDbgInfo::DbgIterator DbgEnd() { return DbgInfo->DbgEnd(); } void dump() const; diff --git a/include/llvm/CodeGen/SelectionDAGISel.h b/include/llvm/CodeGen/SelectionDAGISel.h index a1576be90b78..def09c76fcbe 100644 --- a/include/llvm/CodeGen/SelectionDAGISel.h +++ b/include/llvm/CodeGen/SelectionDAGISel.h @@ -136,6 +136,8 @@ class SelectionDAGISel : public MachineFunctionPass { OPC_EmitRegister, OPC_EmitConvertToTarget, OPC_EmitMergeInputChains, + OPC_EmitMergeInputChains1_0, + OPC_EmitMergeInputChains1_1, OPC_EmitCopyToReg, OPC_EmitNodeXForm, OPC_EmitNode, diff --git a/include/llvm/CodeGen/SelectionDAGNodes.h b/include/llvm/CodeGen/SelectionDAGNodes.h index c16a48aea2a9..4dcf0135a474 100644 --- a/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/include/llvm/CodeGen/SelectionDAGNodes.h @@ -616,7 +616,7 @@ namespace ISD { /// which do not reference a specific memory location should be less than /// this value. Those that do must not be less than this value, and can /// be used with SelectionDAG::getMemIntrinsicNode. 
- static const int FIRST_TARGET_MEMORY_OPCODE = BUILTIN_OP_END+80; + static const int FIRST_TARGET_MEMORY_OPCODE = BUILTIN_OP_END+100; /// Node predicates diff --git a/include/llvm/Function.h b/include/llvm/Function.h index 38822332c433..2b19fa5a7f38 100644 --- a/include/llvm/Function.h +++ b/include/llvm/Function.h @@ -409,8 +409,11 @@ class Function : public GlobalValue, void dropAllReferences(); /// hasAddressTaken - returns true if there are any uses of this function - /// other than direct calls or invokes to it. - bool hasAddressTaken() const; + /// other than direct calls or invokes to it. Optionally passes back the + /// offending user for diagnostic purposes. + /// + bool hasAddressTaken(const User** = 0) const; + private: // Shadow Value::setValueSubclassData with a private forwarding method so that // subclasses cannot accidentally use it. diff --git a/include/llvm/InlineAsm.h b/include/llvm/InlineAsm.h index 4490ce5a546c..2ac0fcab7960 100644 --- a/include/llvm/InlineAsm.h +++ b/include/llvm/InlineAsm.h @@ -24,8 +24,17 @@ namespace llvm { class PointerType; class FunctionType; class Module; +struct InlineAsmKeyType; +template +class ConstantUniqueMap; +template +struct ConstantCreator; class InlineAsm : public Value { + friend struct ConstantCreator; + friend class ConstantUniqueMap; + InlineAsm(const InlineAsm &); // do not implement void operator=(const InlineAsm&); // do not implement @@ -33,10 +42,14 @@ class InlineAsm : public Value { bool HasSideEffects; bool IsAlignStack; - InlineAsm(const FunctionType *Ty, StringRef AsmString, - StringRef Constraints, bool hasSideEffects, - bool isAlignStack = false); + InlineAsm(const PointerType *Ty, const std::string &AsmString, + const std::string &Constraints, bool hasSideEffects, + bool isAlignStack); virtual ~InlineAsm(); + + /// When the ConstantUniqueMap merges two types and makes two InlineAsms + /// identical, it destroys one of them with this method. 
+ void destroyConstant(); public: /// InlineAsm::get - Return the specified uniqued inline asm string. diff --git a/include/llvm/Instruction.h b/include/llvm/Instruction.h index cf9dc4456fa4..13331e60ed2f 100644 --- a/include/llvm/Instruction.h +++ b/include/llvm/Instruction.h @@ -17,6 +17,7 @@ #include "llvm/User.h" #include "llvm/ADT/ilist_node.h" +#include "llvm/Support/DebugLoc.h" namespace llvm { @@ -31,6 +32,7 @@ class Instruction : public User, public ilist_node { Instruction(const Instruction &); // Do not implement BasicBlock *Parent; + NewDebugLoc DbgLoc; // 'dbg' Metadata cache. enum { /// HasMetadataBit - This is a bit stored in the SubClassData field which @@ -123,7 +125,13 @@ class Instruction : public User, public ilist_node { /// hasMetadata() - Return true if this instruction has any metadata attached /// to it. bool hasMetadata() const { - return (getSubclassDataFromValue() & HasMetadataBit) != 0; + return !DbgLoc.isUnknown() || hasMetadataHashEntry(); + } + + /// hasMetadataOtherThanDebugLoc - Return true if this instruction has + /// metadata attached to it other than a debug location. + bool hasMetadataOtherThanDebugLoc() const { + return hasMetadataHashEntry(); } /// getMetadata - Get the metadata of given kind attached to this Instruction. @@ -148,17 +156,49 @@ class Instruction : public User, public ilist_node { getAllMetadataImpl(MDs); } + /// getAllMetadataOtherThanDebugLoc - This does the same thing as + /// getAllMetadata, except that it filters out the debug location. + void getAllMetadataOtherThanDebugLoc(SmallVectorImpl > &MDs) const { + if (hasMetadataOtherThanDebugLoc()) + getAllMetadataOtherThanDebugLocImpl(MDs); + } + /// setMetadata - Set the metadata of the specified kind to the specified /// node. This updates/replaces metadata if already present, or removes it if /// Node is null. 
void setMetadata(unsigned KindID, MDNode *Node); void setMetadata(const char *Kind, MDNode *Node); + /// setDbgMetadata - This is just an optimized helper function that is + /// equivalent to setMetadata("dbg", Node); + void setDbgMetadata(MDNode *Node); + + /// getDbgMetadata - This is just an optimized helper function that is + /// equivalent to calling getMetadata("dbg"). + MDNode *getDbgMetadata() const { + return DbgLoc.getAsMDNode(getContext()); + } + + /// setDebugLoc - Set the debug location information for this instruction. + void setDebugLoc(const NewDebugLoc &Loc) { DbgLoc = Loc; } + + /// getDebugLoc - Return the debug location for this node as a DebugLoc. + const NewDebugLoc &getDebugLoc() const { return DbgLoc; } + private: + /// hasMetadataHashEntry - Return true if we have an entry in the on-the-side + /// metadata hash. + bool hasMetadataHashEntry() const { + return (getSubclassDataFromValue() & HasMetadataBit) != 0; + } + // These are all implemented in Metadata.cpp. MDNode *getMetadataImpl(unsigned KindID) const; MDNode *getMetadataImpl(const char *Kind) const; void getAllMetadataImpl(SmallVectorImpl > &)const; + void getAllMetadataOtherThanDebugLocImpl(SmallVectorImpl > &) const; void removeAllMetadata(); public: //===--------------------------------------------------------------------===// @@ -315,7 +355,7 @@ class Instruction : public User, public ilist_node { return Value::getSubclassDataFromValue(); } - void setHasMetadata(bool V) { + void setHasMetadataHashEntry(bool V) { setValueSubclassData((getSubclassDataFromValue() & ~HasMetadataBit) | (V ? 
HasMetadataBit : 0)); } diff --git a/include/llvm/Instructions.h b/include/llvm/Instructions.h index b1f199604589..413a595ab38a 100644 --- a/include/llvm/Instructions.h +++ b/include/llvm/Instructions.h @@ -971,6 +971,13 @@ class CallInst : public Instruction { unsigned getParamAlignment(unsigned i) const { return AttributeList.getParamAlignment(i); } + + /// @brief Return true if the call should not be inlined. + bool isNoInline() const { return paramHasAttr(~0, Attribute::NoInline); } + void setIsNoInline(bool Value) { + if (Value) addAttribute(~0, Attribute::NoInline); + else removeAttribute(~0, Attribute::NoInline); + } /// @brief Determine if the call does not access memory. bool doesNotAccessMemory() const { @@ -2456,6 +2463,13 @@ class InvokeInst : public TerminatorInst { return AttributeList.getParamAlignment(i); } + /// @brief Return true if the call should not be inlined. + bool isNoInline() const { return paramHasAttr(~0, Attribute::NoInline); } + void setIsNoInline(bool Value) { + if (Value) addAttribute(~0, Attribute::NoInline); + else removeAttribute(~0, Attribute::NoInline); + } + /// @brief Determine if the call does not access memory. bool doesNotAccessMemory() const { return paramHasAttr(~0, Attribute::ReadNone); @@ -2508,32 +2522,31 @@ class InvokeInst : public TerminatorInst { /// indirect function invocation. /// Function *getCalledFunction() const { - return dyn_cast(getOperand(0)); + return dyn_cast(Op<-3>()); } /// getCalledValue - Get a pointer to the function that is invoked by this /// instruction - const Value *getCalledValue() const { return getOperand(0); } - Value *getCalledValue() { return getOperand(0); } + const Value *getCalledValue() const { return Op<-3>(); } + Value *getCalledValue() { return Op<-3>(); } /// setCalledFunction - Set the function called. void setCalledFunction(Value* Fn) { - Op<0>() = Fn; + Op<-3>() = Fn; } // get*Dest - Return the destination basic blocks... 
BasicBlock *getNormalDest() const { - return cast(getOperand(1)); + return cast(Op<-2>()); } BasicBlock *getUnwindDest() const { - return cast(getOperand(2)); + return cast(Op<-1>()); } void setNormalDest(BasicBlock *B) { - setOperand(1, (Value*)B); + Op<-2>() = reinterpret_cast(B); } - void setUnwindDest(BasicBlock *B) { - setOperand(2, (Value*)B); + Op<-1>() = reinterpret_cast(B); } BasicBlock *getSuccessor(unsigned i) const { @@ -2543,7 +2556,7 @@ class InvokeInst : public TerminatorInst { void setSuccessor(unsigned idx, BasicBlock *NewSucc) { assert(idx < 2 && "Successor # out of range for invoke!"); - setOperand(idx+1, (Value*)NewSucc); + *(&Op<-2>() + idx) = reinterpret_cast(NewSucc); } unsigned getNumSuccessors() const { return 2; } @@ -2556,6 +2569,7 @@ class InvokeInst : public TerminatorInst { static inline bool classof(const Value *V) { return isa(V) && classof(cast(V)); } + private: virtual BasicBlock *getSuccessorV(unsigned idx) const; virtual unsigned getNumSuccessorsV() const; diff --git a/include/llvm/Intrinsics.td b/include/llvm/Intrinsics.td index 54c7b1f102b3..d66e80fb22fb 100644 --- a/include/llvm/Intrinsics.td +++ b/include/llvm/Intrinsics.td @@ -176,19 +176,19 @@ class GCCBuiltin { //===--------------- Variable Argument Handling Intrinsics ----------------===// // -def int_vastart : Intrinsic<[llvm_void_ty], [llvm_ptr_ty], [], "llvm.va_start">; -def int_vacopy : Intrinsic<[llvm_void_ty], [llvm_ptr_ty, llvm_ptr_ty], [], +def int_vastart : Intrinsic<[], [llvm_ptr_ty], [], "llvm.va_start">; +def int_vacopy : Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty], [], "llvm.va_copy">; -def int_vaend : Intrinsic<[llvm_void_ty], [llvm_ptr_ty], [], "llvm.va_end">; +def int_vaend : Intrinsic<[], [llvm_ptr_ty], [], "llvm.va_end">; //===------------------- Garbage Collection Intrinsics --------------------===// // -def int_gcroot : Intrinsic<[llvm_void_ty], +def int_gcroot : Intrinsic<[], [llvm_ptrptr_ty, llvm_ptr_ty]>; def int_gcread : Intrinsic<[llvm_ptr_ty], 
[llvm_ptr_ty, llvm_ptrptr_ty], [IntrReadArgMem]>; -def int_gcwrite : Intrinsic<[llvm_void_ty], +def int_gcwrite : Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty, llvm_ptrptr_ty], [IntrWriteArgMem, NoCapture<1>, NoCapture<2>]>; @@ -201,37 +201,37 @@ def int_frameaddress : Intrinsic<[llvm_ptr_ty], [llvm_i32_ty], [IntrNoMem]>; // model their dependencies on allocas. def int_stacksave : Intrinsic<[llvm_ptr_ty]>, GCCBuiltin<"__builtin_stack_save">; -def int_stackrestore : Intrinsic<[llvm_void_ty], [llvm_ptr_ty]>, +def int_stackrestore : Intrinsic<[], [llvm_ptr_ty]>, GCCBuiltin<"__builtin_stack_restore">; // IntrWriteArgMem is more pessimistic than strictly necessary for prefetch, // however it does conveniently prevent the prefetch from being reordered // with respect to nearby accesses to the same memory. -def int_prefetch : Intrinsic<[llvm_void_ty], +def int_prefetch : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrWriteArgMem, NoCapture<0>]>; -def int_pcmarker : Intrinsic<[llvm_void_ty], [llvm_i32_ty]>; +def int_pcmarker : Intrinsic<[], [llvm_i32_ty]>; def int_readcyclecounter : Intrinsic<[llvm_i64_ty]>; // Stack Protector Intrinsic - The stackprotector intrinsic writes the stack // guard to the correct place on the stack frame. 
-def int_stackprotector : Intrinsic<[llvm_void_ty], +def int_stackprotector : Intrinsic<[], [llvm_ptr_ty, llvm_ptrptr_ty], [IntrWriteMem]>; //===------------------- Standard C Library Intrinsics --------------------===// // -def int_memcpy : Intrinsic<[llvm_void_ty], +def int_memcpy : Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty, llvm_anyint_ty, llvm_i32_ty], [IntrWriteArgMem, NoCapture<0>, NoCapture<1>]>; -def int_memmove : Intrinsic<[llvm_void_ty], +def int_memmove : Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty, llvm_anyint_ty, llvm_i32_ty], [IntrWriteArgMem, NoCapture<0>, NoCapture<1>]>; -def int_memset : Intrinsic<[llvm_void_ty], +def int_memset : Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty, llvm_anyint_ty, llvm_i32_ty], [IntrWriteArgMem, NoCapture<0>]>; @@ -255,9 +255,9 @@ let Properties = [IntrReadMem] in { // NOTE: these are internal interfaces. def int_setjmp : Intrinsic<[llvm_i32_ty], [llvm_ptr_ty]>; -def int_longjmp : Intrinsic<[llvm_void_ty], [llvm_ptr_ty, llvm_i32_ty]>; +def int_longjmp : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty]>; def int_sigsetjmp : Intrinsic<[llvm_i32_ty] , [llvm_ptr_ty, llvm_i32_ty]>; -def int_siglongjmp : Intrinsic<[llvm_void_ty], [llvm_ptr_ty, llvm_i32_ty]>; +def int_siglongjmp : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty]>; // Internal interface for object size checking def int_objectsize : Intrinsic<[llvm_anyint_ty], [llvm_ptr_ty, llvm_i1_ty], @@ -282,9 +282,9 @@ let Properties = [IntrNoMem] in { // optimizers can change them aggressively. Special handling needed in a few // places. 
let Properties = [IntrNoMem] in { - def int_dbg_declare : Intrinsic<[llvm_void_ty], + def int_dbg_declare : Intrinsic<[], [llvm_metadata_ty, llvm_metadata_ty]>; - def int_dbg_value : Intrinsic<[llvm_void_ty], + def int_dbg_value : Intrinsic<[], [llvm_metadata_ty, llvm_i64_ty, llvm_metadata_ty]>; } @@ -297,24 +297,24 @@ def int_eh_selector : Intrinsic<[llvm_i32_ty], def int_eh_typeid_for : Intrinsic<[llvm_i32_ty], [llvm_ptr_ty]>; -def int_eh_return_i32 : Intrinsic<[llvm_void_ty], [llvm_i32_ty, llvm_ptr_ty]>; -def int_eh_return_i64 : Intrinsic<[llvm_void_ty], [llvm_i64_ty, llvm_ptr_ty]>; +def int_eh_return_i32 : Intrinsic<[], [llvm_i32_ty, llvm_ptr_ty]>; +def int_eh_return_i64 : Intrinsic<[], [llvm_i64_ty, llvm_ptr_ty]>; -def int_eh_unwind_init: Intrinsic<[llvm_void_ty]>, +def int_eh_unwind_init: Intrinsic<[]>, GCCBuiltin<"__builtin_unwind_init">; def int_eh_dwarf_cfa : Intrinsic<[llvm_ptr_ty], [llvm_i32_ty]>; let Properties = [IntrNoMem] in { def int_eh_sjlj_setjmp : Intrinsic<[llvm_i32_ty], [llvm_ptr_ty]>; - def int_eh_sjlj_longjmp : Intrinsic<[llvm_void_ty], [llvm_ptr_ty]>; + def int_eh_sjlj_longjmp : Intrinsic<[], [llvm_ptr_ty]>; def int_eh_sjlj_lsda : Intrinsic<[llvm_ptr_ty]>; - def int_eh_sjlj_callsite: Intrinsic<[llvm_void_ty], [llvm_i32_ty]>; + def int_eh_sjlj_callsite: Intrinsic<[], [llvm_i32_ty]>; } //===---------------- Generic Variable Attribute Intrinsics----------------===// // -def int_var_annotation : Intrinsic<[llvm_void_ty], +def int_var_annotation : Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty, llvm_ptr_ty, llvm_i32_ty], [], "llvm.var.annotation">; @@ -361,7 +361,7 @@ def int_umul_with_overflow : Intrinsic<[llvm_anyint_ty, llvm_i1_ty], //===------------------------- Atomic Intrinsics --------------------------===// // -def int_memory_barrier : Intrinsic<[llvm_void_ty], +def int_memory_barrier : Intrinsic<[], [llvm_i1_ty, llvm_i1_ty, llvm_i1_ty, llvm_i1_ty, llvm_i1_ty], []>, GCCBuiltin<"__builtin_llvm_memory_barrier">; @@ -429,16 +429,16 @@ def 
int_atomic_load_umax : Intrinsic<[llvm_anyint_ty], //===------------------------- Memory Use Markers -------------------------===// // -def int_lifetime_start : Intrinsic<[llvm_void_ty], +def int_lifetime_start : Intrinsic<[], [llvm_i64_ty, llvm_ptr_ty], [IntrWriteArgMem, NoCapture<1>]>; -def int_lifetime_end : Intrinsic<[llvm_void_ty], +def int_lifetime_end : Intrinsic<[], [llvm_i64_ty, llvm_ptr_ty], [IntrWriteArgMem, NoCapture<1>]>; def int_invariant_start : Intrinsic<[llvm_descriptor_ty], [llvm_i64_ty, llvm_ptr_ty], [IntrReadArgMem, NoCapture<1>]>; -def int_invariant_end : Intrinsic<[llvm_void_ty], +def int_invariant_end : Intrinsic<[], [llvm_descriptor_ty, llvm_i64_ty, llvm_ptr_ty], [IntrWriteArgMem, NoCapture<2>]>; @@ -447,7 +447,7 @@ def int_invariant_end : Intrinsic<[llvm_void_ty], // def int_flt_rounds : Intrinsic<[llvm_i32_ty]>, GCCBuiltin<"__builtin_flt_rounds">; -def int_trap : Intrinsic<[llvm_void_ty]>, +def int_trap : Intrinsic<[]>, GCCBuiltin<"__builtin_trap">; // Intrisics to support half precision floating point format diff --git a/include/llvm/IntrinsicsARM.td b/include/llvm/IntrinsicsARM.td index c408a2f374ec..40333caead7b 100644 --- a/include/llvm/IntrinsicsARM.td +++ b/include/llvm/IntrinsicsARM.td @@ -344,31 +344,31 @@ let TargetPrefix = "arm" in { [IntrReadArgMem]>; // Interleaving vector stores from N-element structures. 
- def int_arm_neon_vst1 : Intrinsic<[llvm_void_ty], + def int_arm_neon_vst1 : Intrinsic<[], [llvm_ptr_ty, llvm_anyvector_ty], [IntrWriteArgMem]>; - def int_arm_neon_vst2 : Intrinsic<[llvm_void_ty], + def int_arm_neon_vst2 : Intrinsic<[], [llvm_ptr_ty, llvm_anyvector_ty, LLVMMatchType<0>], [IntrWriteArgMem]>; - def int_arm_neon_vst3 : Intrinsic<[llvm_void_ty], + def int_arm_neon_vst3 : Intrinsic<[], [llvm_ptr_ty, llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>], [IntrWriteArgMem]>; - def int_arm_neon_vst4 : Intrinsic<[llvm_void_ty], + def int_arm_neon_vst4 : Intrinsic<[], [llvm_ptr_ty, llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrWriteArgMem]>; // Vector store N-element structure from one lane. - def int_arm_neon_vst2lane : Intrinsic<[llvm_void_ty], + def int_arm_neon_vst2lane : Intrinsic<[], [llvm_ptr_ty, llvm_anyvector_ty, LLVMMatchType<0>, llvm_i32_ty], [IntrWriteArgMem]>; - def int_arm_neon_vst3lane : Intrinsic<[llvm_void_ty], + def int_arm_neon_vst3lane : Intrinsic<[], [llvm_ptr_ty, llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty], [IntrWriteArgMem]>; - def int_arm_neon_vst4lane : Intrinsic<[llvm_void_ty], + def int_arm_neon_vst4lane : Intrinsic<[], [llvm_ptr_ty, llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty], diff --git a/include/llvm/IntrinsicsPowerPC.td b/include/llvm/IntrinsicsPowerPC.td index ffb870da1c33..4e959f337c00 100644 --- a/include/llvm/IntrinsicsPowerPC.td +++ b/include/llvm/IntrinsicsPowerPC.td @@ -18,17 +18,17 @@ // Non-altivec intrinsics. let TargetPrefix = "ppc" in { // All intrinsics start with "llvm.ppc.". // dcba/dcbf/dcbi/dcbst/dcbt/dcbz/dcbzl(PPC970) instructions. 
- def int_ppc_dcba : Intrinsic<[llvm_void_ty], [llvm_ptr_ty], [IntrWriteMem]>; - def int_ppc_dcbf : Intrinsic<[llvm_void_ty], [llvm_ptr_ty], [IntrWriteMem]>; - def int_ppc_dcbi : Intrinsic<[llvm_void_ty], [llvm_ptr_ty], [IntrWriteMem]>; - def int_ppc_dcbst : Intrinsic<[llvm_void_ty], [llvm_ptr_ty], [IntrWriteMem]>; - def int_ppc_dcbt : Intrinsic<[llvm_void_ty], [llvm_ptr_ty], [IntrWriteMem]>; - def int_ppc_dcbtst: Intrinsic<[llvm_void_ty], [llvm_ptr_ty], [IntrWriteMem]>; - def int_ppc_dcbz : Intrinsic<[llvm_void_ty], [llvm_ptr_ty], [IntrWriteMem]>; - def int_ppc_dcbzl : Intrinsic<[llvm_void_ty], [llvm_ptr_ty], [IntrWriteMem]>; + def int_ppc_dcba : Intrinsic<[], [llvm_ptr_ty], [IntrWriteMem]>; + def int_ppc_dcbf : Intrinsic<[], [llvm_ptr_ty], [IntrWriteMem]>; + def int_ppc_dcbi : Intrinsic<[], [llvm_ptr_ty], [IntrWriteMem]>; + def int_ppc_dcbst : Intrinsic<[], [llvm_ptr_ty], [IntrWriteMem]>; + def int_ppc_dcbt : Intrinsic<[], [llvm_ptr_ty], [IntrWriteMem]>; + def int_ppc_dcbtst: Intrinsic<[], [llvm_ptr_ty], [IntrWriteMem]>; + def int_ppc_dcbz : Intrinsic<[], [llvm_ptr_ty], [IntrWriteMem]>; + def int_ppc_dcbzl : Intrinsic<[], [llvm_ptr_ty], [IntrWriteMem]>; // sync instruction - def int_ppc_sync : Intrinsic<[llvm_void_ty], [], [IntrWriteMem]>; + def int_ppc_sync : Intrinsic<[], [], [IntrWriteMem]>; } @@ -86,23 +86,23 @@ class PowerPC_Vec_WWW_Intrinsic let TargetPrefix = "ppc" in { // All intrinsics start with "llvm.ppc.". // Data Stream Control. 
def int_ppc_altivec_dss : GCCBuiltin<"__builtin_altivec_dss">, - Intrinsic<[llvm_void_ty], [llvm_i32_ty], [IntrWriteMem]>; + Intrinsic<[], [llvm_i32_ty], [IntrWriteMem]>; def int_ppc_altivec_dssall : GCCBuiltin<"__builtin_altivec_dssall">, - Intrinsic<[llvm_void_ty], [], [IntrWriteMem]>; + Intrinsic<[], [], [IntrWriteMem]>; def int_ppc_altivec_dst : GCCBuiltin<"__builtin_altivec_dst">, - Intrinsic<[llvm_void_ty], + Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrWriteMem]>; def int_ppc_altivec_dstt : GCCBuiltin<"__builtin_altivec_dstt">, - Intrinsic<[llvm_void_ty], + Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrWriteMem]>; def int_ppc_altivec_dstst : GCCBuiltin<"__builtin_altivec_dstst">, - Intrinsic<[llvm_void_ty], + Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrWriteMem]>; def int_ppc_altivec_dststt : GCCBuiltin<"__builtin_altivec_dststt">, - Intrinsic<[llvm_void_ty], + Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrWriteMem]>; @@ -110,7 +110,7 @@ let TargetPrefix = "ppc" in { // All intrinsics start with "llvm.ppc.". def int_ppc_altivec_mfvscr : GCCBuiltin<"__builtin_altivec_mfvscr">, Intrinsic<[llvm_v8i16_ty], [], [IntrReadMem]>; def int_ppc_altivec_mtvscr : GCCBuiltin<"__builtin_altivec_mtvscr">, - Intrinsic<[llvm_void_ty], [llvm_v4i32_ty], [IntrWriteMem]>; + Intrinsic<[], [llvm_v4i32_ty], [IntrWriteMem]>; // Loads. These don't map directly to GCC builtins because they represent the @@ -129,19 +129,19 @@ let TargetPrefix = "ppc" in { // All intrinsics start with "llvm.ppc.". // Stores. These don't map directly to GCC builtins because they represent the // source address with a single pointer. 
def int_ppc_altivec_stvx : - Intrinsic<[llvm_void_ty], [llvm_v4i32_ty, llvm_ptr_ty], + Intrinsic<[], [llvm_v4i32_ty, llvm_ptr_ty], [IntrWriteMem]>; def int_ppc_altivec_stvxl : - Intrinsic<[llvm_void_ty], [llvm_v4i32_ty, llvm_ptr_ty], + Intrinsic<[], [llvm_v4i32_ty, llvm_ptr_ty], [IntrWriteMem]>; def int_ppc_altivec_stvebx : - Intrinsic<[llvm_void_ty], [llvm_v16i8_ty, llvm_ptr_ty], + Intrinsic<[], [llvm_v16i8_ty, llvm_ptr_ty], [IntrWriteMem]>; def int_ppc_altivec_stvehx : - Intrinsic<[llvm_void_ty], [llvm_v8i16_ty, llvm_ptr_ty], + Intrinsic<[], [llvm_v8i16_ty, llvm_ptr_ty], [IntrWriteMem]>; def int_ppc_altivec_stvewx : - Intrinsic<[llvm_void_ty], [llvm_v4i32_ty, llvm_ptr_ty], + Intrinsic<[], [llvm_v4i32_ty, llvm_ptr_ty], [IntrWriteMem]>; // Comparisons setting a vector. diff --git a/include/llvm/IntrinsicsX86.td b/include/llvm/IntrinsicsX86.td index 67abd955a1ae..25169b4597d7 100644 --- a/include/llvm/IntrinsicsX86.td +++ b/include/llvm/IntrinsicsX86.td @@ -142,25 +142,25 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". // SIMD store ops let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_sse_storeu_ps : GCCBuiltin<"__builtin_ia32_storeups">, - Intrinsic<[llvm_void_ty], [llvm_ptr_ty, + Intrinsic<[], [llvm_ptr_ty, llvm_v4f32_ty], [IntrWriteMem]>; } // Cacheability support ops let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_sse_movnt_ps : GCCBuiltin<"__builtin_ia32_movntps">, - Intrinsic<[llvm_void_ty], [llvm_ptr_ty, + Intrinsic<[], [llvm_ptr_ty, llvm_v4f32_ty], [IntrWriteMem]>; def int_x86_sse_sfence : GCCBuiltin<"__builtin_ia32_sfence">, - Intrinsic<[llvm_void_ty], [], [IntrWriteMem]>; + Intrinsic<[], [], [IntrWriteMem]>; } // Control register. let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
def int_x86_sse_stmxcsr : - Intrinsic<[llvm_void_ty], [llvm_ptr_ty], [IntrWriteMem]>; + Intrinsic<[], [llvm_ptr_ty], [IntrWriteMem]>; def int_x86_sse_ldmxcsr : - Intrinsic<[llvm_void_ty], [llvm_ptr_ty], [IntrWriteMem]>; + Intrinsic<[], [llvm_ptr_ty], [IntrWriteMem]>; } // Misc. @@ -458,26 +458,26 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". // SIMD store ops let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_sse2_storeu_pd : GCCBuiltin<"__builtin_ia32_storeupd">, - Intrinsic<[llvm_void_ty], [llvm_ptr_ty, + Intrinsic<[], [llvm_ptr_ty, llvm_v2f64_ty], [IntrWriteMem]>; def int_x86_sse2_storeu_dq : GCCBuiltin<"__builtin_ia32_storedqu">, - Intrinsic<[llvm_void_ty], [llvm_ptr_ty, + Intrinsic<[], [llvm_ptr_ty, llvm_v16i8_ty], [IntrWriteMem]>; def int_x86_sse2_storel_dq : GCCBuiltin<"__builtin_ia32_storelv4si">, - Intrinsic<[llvm_void_ty], [llvm_ptr_ty, + Intrinsic<[], [llvm_ptr_ty, llvm_v4i32_ty], [IntrWriteMem]>; } // Cacheability support ops let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_sse2_movnt_dq : GCCBuiltin<"__builtin_ia32_movntdq">, - Intrinsic<[llvm_void_ty], [llvm_ptr_ty, + Intrinsic<[], [llvm_ptr_ty, llvm_v2i64_ty], [IntrWriteMem]>; def int_x86_sse2_movnt_pd : GCCBuiltin<"__builtin_ia32_movntpd">, - Intrinsic<[llvm_void_ty], [llvm_ptr_ty, + Intrinsic<[], [llvm_ptr_ty, llvm_v2f64_ty], [IntrWriteMem]>; def int_x86_sse2_movnt_i : GCCBuiltin<"__builtin_ia32_movnti">, - Intrinsic<[llvm_void_ty], [llvm_ptr_ty, + Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], [IntrWriteMem]>; } @@ -497,14 +497,14 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
def int_x86_sse2_pmovmskb_128 : GCCBuiltin<"__builtin_ia32_pmovmskb128">, Intrinsic<[llvm_i32_ty], [llvm_v16i8_ty], [IntrNoMem]>; def int_x86_sse2_maskmov_dqu : GCCBuiltin<"__builtin_ia32_maskmovdqu">, - Intrinsic<[llvm_void_ty], [llvm_v16i8_ty, + Intrinsic<[], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_ptr_ty], [IntrWriteMem]>; def int_x86_sse2_clflush : GCCBuiltin<"__builtin_ia32_clflush">, - Intrinsic<[llvm_void_ty], [llvm_ptr_ty], [IntrWriteMem]>; + Intrinsic<[], [llvm_ptr_ty], [IntrWriteMem]>; def int_x86_sse2_lfence : GCCBuiltin<"__builtin_ia32_lfence">, - Intrinsic<[llvm_void_ty], [], [IntrWriteMem]>; + Intrinsic<[], [], [IntrWriteMem]>; def int_x86_sse2_mfence : GCCBuiltin<"__builtin_ia32_mfence">, - Intrinsic<[llvm_void_ty], [], [IntrWriteMem]>; + Intrinsic<[], [], [IntrWriteMem]>; } //===----------------------------------------------------------------------===// @@ -545,10 +545,10 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". // Thread synchronization ops. let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_sse3_monitor : GCCBuiltin<"__builtin_ia32_monitor">, - Intrinsic<[llvm_void_ty], [llvm_ptr_ty, + Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrWriteMem]>; def int_x86_sse3_mwait : GCCBuiltin<"__builtin_ia32_mwait">, - Intrinsic<[llvm_void_ty], [llvm_i32_ty, + Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrWriteMem]>; } @@ -779,6 +779,29 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". [IntrNoMem, Commutative]>; } +// Advanced Encryption Standard (AES) Instructions +let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
+ def int_x86_aesni_aesimc : GCCBuiltin<"__builtin_ia32_aesimc128">, + Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], + [IntrNoMem]>; + def int_x86_aesni_aesenc : GCCBuiltin<"__builtin_ia32_aesenc128">, + Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], + [IntrNoMem]>; + def int_x86_aesni_aesenclast : GCCBuiltin<"__builtin_ia32_aesenclast128">, + Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], + [IntrNoMem]>; + def int_x86_aesni_aesdec : GCCBuiltin<"__builtin_ia32_aesdec128">, + Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], + [IntrNoMem]>; + def int_x86_aesni_aesdeclast : GCCBuiltin<"__builtin_ia32_aesdeclast128">, + Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], + [IntrNoMem]>; + def int_x86_aesni_aeskeygenassist : + GCCBuiltin<"__builtin_ia32_aeskeygenassist">, + Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], + [IntrNoMem]>; +} + // Vector pack let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_sse41_packusdw : GCCBuiltin<"__builtin_ia32_packusdw128">, @@ -791,9 +814,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_sse41_pmuldq : GCCBuiltin<"__builtin_ia32_pmuldq128">, Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem, Commutative]>; - def int_x86_sse41_pmulld : GCCBuiltin<"__builtin_ia32_pmulld128">, - Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], - [IntrNoMem, Commutative]>; } // Vector extract @@ -973,9 +993,9 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". // Empty MMX state op. let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_mmx_emms : GCCBuiltin<"__builtin_ia32_emms">, - Intrinsic<[llvm_void_ty], [], [IntrWriteMem]>; + Intrinsic<[], [], [IntrWriteMem]>; def int_x86_mmx_femms : GCCBuiltin<"__builtin_ia32_femms">, - Intrinsic<[llvm_void_ty], [], [IntrWriteMem]>; + Intrinsic<[], [], [IntrWriteMem]>; } // Integer arithmetic ops. 
@@ -1151,7 +1171,7 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". // Misc. let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_mmx_maskmovq : GCCBuiltin<"__builtin_ia32_maskmovq">, - Intrinsic<[llvm_void_ty], + Intrinsic<[], [llvm_v8i8_ty, llvm_v8i8_ty, llvm_ptr_ty], [IntrWriteMem]>; @@ -1159,6 +1179,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". Intrinsic<[llvm_i32_ty], [llvm_v8i8_ty], [IntrNoMem]>; def int_x86_mmx_movnt_dq : GCCBuiltin<"__builtin_ia32_movntq">, - Intrinsic<[llvm_void_ty], [llvm_ptr_ty, + Intrinsic<[], [llvm_ptr_ty, llvm_v1i64_ty], [IntrWriteMem]>; } diff --git a/include/llvm/LLVMContext.h b/include/llvm/LLVMContext.h index 6d36d5eb1e37..ea7f4a2d8e9d 100644 --- a/include/llvm/LLVMContext.h +++ b/include/llvm/LLVMContext.h @@ -36,6 +36,12 @@ class LLVMContext { LLVMContext(); ~LLVMContext(); + // Pinned metadata names, which always have the same value. This is a + // compile-time performance optimization, not a correctness optimization. + enum { + MD_dbg = 1 // "dbg" -> 1. + }; + /// getMDKindID - Return a unique non-zero ID for the specified metadata kind. /// This ID is uniqued across modules in the current LLVMContext. unsigned getMDKindID(StringRef Name) const; diff --git a/include/llvm/MC/MCAsmLayout.h b/include/llvm/MC/MCAsmLayout.h index 27bdbe9cbf59..ebf0520d091f 100644 --- a/include/llvm/MC/MCAsmLayout.h +++ b/include/llvm/MC/MCAsmLayout.h @@ -12,6 +12,9 @@ namespace llvm { class MCAssembler; +class MCFragment; +class MCSectionData; +class MCSymbolData; /// Encapsulates the layout of an assembly file at a particular point in time. /// @@ -29,6 +32,64 @@ class MCAsmLayout { /// Get the assembler object this is a layout for. MCAssembler &getAssembler() const { return Assembler; } + + /// \brief Update the layout because a fragment has been resized. 
The + /// fragments size should have already been updated, the \arg SlideAmount is + /// the delta from the old size. + void UpdateForSlide(MCFragment *F, int SlideAmount); + + /// @name Fragment Layout Data + /// @{ + + /// \brief Get the effective size of the given fragment, as computed in the + /// current layout. + uint64_t getFragmentEffectiveSize(const MCFragment *F) const; + + /// \brief Set the effective size of the given fragment. + void setFragmentEffectiveSize(MCFragment *F, uint64_t Value); + + /// \brief Get the offset of the given fragment inside its containing section. + uint64_t getFragmentOffset(const MCFragment *F) const; + + /// \brief Set the offset of the given fragment inside its containing section. + void setFragmentOffset(MCFragment *F, uint64_t Value); + + /// @} + /// @name Section Layout Data + /// @{ + + /// \brief Get the computed address of the given section. + uint64_t getSectionAddress(const MCSectionData *SD) const; + + /// \brief Set the computed address of the given section. + void setSectionAddress(MCSectionData *SD, uint64_t Value); + + /// \brief Get the data size of the given section, as emitted to the object + /// file. This may include additional padding, or be 0 for virtual sections. + uint64_t getSectionFileSize(const MCSectionData *SD) const; + + /// \brief Set the data size of the given section. + void setSectionFileSize(MCSectionData *SD, uint64_t Value); + + /// \brief Get the actual data size of the given section. + uint64_t getSectionSize(const MCSectionData *SD) const; + + /// \brief Set the actual data size of the given section. + void setSectionSize(MCSectionData *SD, uint64_t Value); + + /// @} + /// @name Utility Functions + /// @{ + + /// \brief Get the address of the given fragment, as computed in the current + /// layout. + uint64_t getFragmentAddress(const MCFragment *F) const; + + /// \brief Get the address of the given symbol, as computed in the current + /// layout. 
+ uint64_t getSymbolAddress(const MCSymbolData *SD) const; + + /// @} }; } // end namespace llvm diff --git a/include/llvm/MC/MCAssembler.h b/include/llvm/MC/MCAssembler.h index 363c7d977eba..c1b60f011fd2 100644 --- a/include/llvm/MC/MCAssembler.h +++ b/include/llvm/MC/MCAssembler.h @@ -16,6 +16,7 @@ #include "llvm/ADT/ilist_node.h" #include "llvm/Support/Casting.h" #include "llvm/MC/MCFixup.h" +#include "llvm/MC/MCInst.h" #include "llvm/System/DataTypes.h" #include // FIXME: Shouldn't be needed. @@ -37,6 +38,8 @@ class TargetAsmBackend; /// MCAsmFixup - Represent a fixed size region of bytes inside some fragment /// which needs to be rewritten. This region will either be rewritten by the /// assembler or cause a relocation entry to be generated. +// +// FIXME: This should probably just be merged with MCFixup. class MCAsmFixup { public: /// Offset - The offset inside the fragment which needs to be rewritten. @@ -54,14 +57,17 @@ class MCAsmFixup { }; class MCFragment : public ilist_node { + friend class MCAsmLayout; + MCFragment(const MCFragment&); // DO NOT IMPLEMENT void operator=(const MCFragment&); // DO NOT IMPLEMENT public: enum FragmentType { - FT_Data, FT_Align, + FT_Data, FT_Fill, + FT_Inst, FT_Org, FT_ZeroFill }; @@ -81,8 +87,13 @@ class MCFragment : public ilist_node { /// initialized. uint64_t Offset; - /// FileSize - The file size of this section. This is ~0 until initialized. - uint64_t FileSize; + /// EffectiveSize - The compute size of this section. This is ~0 until + /// initialized. + uint64_t EffectiveSize; + + /// Ordinal - The global index of this fragment. This is the index across all + /// sections, not just the parent section. + unsigned Ordinal; /// @} @@ -99,35 +110,8 @@ class MCFragment : public ilist_node { MCSectionData *getParent() const { return Parent; } void setParent(MCSectionData *Value) { Parent = Value; } - // FIXME: This should be abstract, fix sentinel. 
- virtual uint64_t getMaxFileSize() const { - assert(0 && "Invalid getMaxFileSize call!"); - return 0; - } - - /// @name Assembler Backend Support - /// @{ - // - // FIXME: This could all be kept private to the assembler implementation. - - uint64_t getAddress() const; - - uint64_t getFileSize() const { - assert(FileSize != ~UINT64_C(0) && "File size not set!"); - return FileSize; - } - void setFileSize(uint64_t Value) { - assert(Value <= getMaxFileSize() && "Invalid file size!"); - FileSize = Value; - } - - uint64_t getOffset() const { - assert(Offset != ~UINT64_C(0) && "File offset not set!"); - return Offset; - } - void setOffset(uint64_t Value) { Offset = Value; } - - /// @} + unsigned getOrdinal() const { return Ordinal; } + void setOrdinal(unsigned Value) { Ordinal = Value; } static bool classof(const MCFragment *O) { return true; } @@ -150,15 +134,10 @@ class MCDataFragment : public MCFragment { /// @name Accessors /// @{ - uint64_t getMaxFileSize() const { - return Contents.size(); - } - SmallString<32> &getContents() { return Contents; } const SmallString<32> &getContents() const { return Contents; } /// @} - /// @name Fixup Access /// @{ @@ -190,6 +169,68 @@ class MCDataFragment : public MCFragment { virtual void dump(); }; +// FIXME: This current incarnation of MCInstFragment doesn't make much sense, as +// it is almost entirely a duplicate of MCDataFragment. If we decide to stick +// with this approach (as opposed to making MCInstFragment a very light weight +// object with just the MCInst and a code size, then we should just change +// MCDataFragment to have an optional MCInst at its end. +class MCInstFragment : public MCFragment { + /// Inst - The instruction this is a fragment for. + MCInst Inst; + + /// InstSize - The size of the currently encoded instruction. + SmallString<8> Code; + + /// Fixups - The list of fixups in this fragment. 
+ SmallVector Fixups; + +public: + typedef SmallVectorImpl::const_iterator const_fixup_iterator; + typedef SmallVectorImpl::iterator fixup_iterator; + +public: + MCInstFragment(MCInst _Inst, MCSectionData *SD = 0) + : MCFragment(FT_Inst, SD), Inst(_Inst) { + } + + /// @name Accessors + /// @{ + + SmallVectorImpl &getCode() { return Code; } + const SmallVectorImpl &getCode() const { return Code; } + + unsigned getInstSize() const { return Code.size(); } + + MCInst &getInst() { return Inst; } + const MCInst &getInst() const { return Inst; } + + void setInst(MCInst Value) { Inst = Value; } + + /// @} + /// @name Fixup Access + /// @{ + + SmallVectorImpl &getFixups() { return Fixups; } + const SmallVectorImpl &getFixups() const { return Fixups; } + + fixup_iterator fixup_begin() { return Fixups.begin(); } + const_fixup_iterator fixup_begin() const { return Fixups.begin(); } + + fixup_iterator fixup_end() {return Fixups.end();} + const_fixup_iterator fixup_end() const {return Fixups.end();} + + size_t fixup_size() const { return Fixups.size(); } + + /// @} + + static bool classof(const MCFragment *F) { + return F->getKind() == MCFragment::FT_Inst; + } + static bool classof(const MCInstFragment *) { return true; } + + virtual void dump(); +}; + class MCAlignFragment : public MCFragment { /// Alignment - The alignment to ensure, in bytes. 
unsigned Alignment; @@ -219,10 +260,6 @@ class MCAlignFragment : public MCFragment { /// @name Accessors /// @{ - uint64_t getMaxFileSize() const { - return std::max(Alignment - 1, MaxBytesToEmit); - } - unsigned getAlignment() const { return Alignment; } int64_t getValue() const { return Value; } @@ -262,10 +299,6 @@ class MCFillFragment : public MCFragment { /// @name Accessors /// @{ - uint64_t getMaxFileSize() const { - return ValueSize * Count; - } - int64_t getValue() const { return Value; } unsigned getValueSize() const { return ValueSize; } @@ -297,11 +330,6 @@ class MCOrgFragment : public MCFragment { /// @name Accessors /// @{ - uint64_t getMaxFileSize() const { - // FIXME: This doesn't make much sense. - return ~UINT64_C(0); - } - const MCExpr &getOffset() const { return *Offset; } uint8_t getValue() const { return Value; } @@ -333,11 +361,6 @@ class MCZeroFillFragment : public MCFragment { /// @name Accessors /// @{ - uint64_t getMaxFileSize() const { - // FIXME: This also doesn't make much sense, this method is misnamed. - return ~UINT64_C(0); - } - uint64_t getSize() const { return Size; } unsigned getAlignment() const { return Alignment; } @@ -356,6 +379,8 @@ class MCZeroFillFragment : public MCFragment { // we anticipate the fast path being through an MCAssembler, the only reason to // keep it out is for API abstraction. class MCSectionData : public ilist_node { + friend class MCAsmLayout; + MCSectionData(const MCSectionData&); // DO NOT IMPLEMENT void operator=(const MCSectionData&); // DO NOT IMPLEMENT @@ -372,6 +397,9 @@ class MCSectionData : public ilist_node { iplist Fragments; const MCSection *Section; + /// Ordinal - The section index in the assemblers section list. + unsigned Ordinal; + /// Alignment - The maximum alignment seen in this section. 
unsigned Alignment; @@ -407,6 +435,12 @@ class MCSectionData : public ilist_node { unsigned getAlignment() const { return Alignment; } void setAlignment(unsigned Value) { Alignment = Value; } + bool hasInstructions() const { return HasInstructions; } + void setHasInstructions(bool Value) { HasInstructions = Value; } + + unsigned getOrdinal() const { return Ordinal; } + void setOrdinal(unsigned Value) { Ordinal = Value; } + /// @name Fragment Access /// @{ @@ -429,36 +463,9 @@ class MCSectionData : public ilist_node { bool empty() const { return Fragments.empty(); } - /// @} - /// @name Assembler Backend Support - /// @{ - // - // FIXME: This could all be kept private to the assembler implementation. - - uint64_t getAddress() const { - assert(Address != ~UINT64_C(0) && "Address not set!"); - return Address; - } - void setAddress(uint64_t Value) { Address = Value; } - - uint64_t getSize() const { - assert(Size != ~UINT64_C(0) && "File size not set!"); - return Size; - } - void setSize(uint64_t Value) { Size = Value; } - - uint64_t getFileSize() const { - assert(FileSize != ~UINT64_C(0) && "File size not set!"); - return FileSize; - } - void setFileSize(uint64_t Value) { FileSize = Value; } - - bool hasInstructions() const { return HasInstructions; } - void setHasInstructions(bool Value) { HasInstructions = Value; } - - /// @} - void dump(); + + /// @} }; // FIXME: Same concerns as with SectionData. 
@@ -515,11 +522,6 @@ class MCSymbolData : public ilist_node { uint64_t getOffset() const { return Offset; } void setOffset(uint64_t Value) { Offset = Value; } - uint64_t getAddress() const { - assert(getFragment() && "Invalid getAddress() on undefined symbol!"); - return getFragment()->getAddress() + getOffset(); - } - /// @} /// @name Symbol Attributes /// @{ @@ -578,6 +580,8 @@ struct IndirectSymbolData { }; class MCAssembler { + friend class MCAsmLayout; + public: typedef iplist SectionDataListType; typedef iplist SymbolDataListType; @@ -620,6 +624,7 @@ class MCAssembler { std::vector IndirectSymbols; + unsigned RelaxAll : 1; unsigned SubsectionsViaSymbols : 1; private: @@ -637,35 +642,49 @@ class MCAssembler { /// \arg Value result is fixed, otherwise the value may change due to /// relocation. bool EvaluateFixup(const MCAsmLayout &Layout, - MCAsmFixup &Fixup, MCDataFragment *DF, + const MCAsmFixup &Fixup, const MCFragment *DF, MCValue &Target, uint64_t &Value) const; /// Check whether a fixup can be satisfied, or whether it needs to be relaxed /// (increased in size, in order to hold its value correctly). - bool FixupNeedsRelaxation(MCAsmFixup &Fixup, MCDataFragment *DF); + bool FixupNeedsRelaxation(const MCAsmFixup &Fixup, const MCFragment *DF, + const MCAsmLayout &Layout) const; - /// LayoutSection - Assign offsets and sizes to the fragments in the section - /// \arg SD, and update the section size. The section file offset should - /// already have been computed. - void LayoutSection(MCSectionData &SD); + /// Check whether the given fragment needs relaxation. + bool FragmentNeedsRelaxation(const MCInstFragment *IF, + const MCAsmLayout &Layout) const; + + /// LayoutSection - Assign the section the given \arg StartAddress, and then + /// assign offsets and sizes to the fragments in the section \arg SD, and + /// update the section size. + /// + /// \return The address at the end of the section, for use in laying out the + /// succeeding section. 
+ uint64_t LayoutSection(MCSectionData &SD, MCAsmLayout &Layout, + uint64_t StartAddress); /// LayoutOnce - Perform one layout iteration and return true if any offsets /// were adjusted. - bool LayoutOnce(); + bool LayoutOnce(MCAsmLayout &Layout); + + /// FinishLayout - Finalize a layout, including fragment lowering. + void FinishLayout(MCAsmLayout &Layout); public: /// Find the symbol which defines the atom containing given address, inside /// the given section, or null if there is no such symbol. // - // FIXME: Eliminate this, it is very slow. - const MCSymbolData *getAtomForAddress(const MCSectionData *Section, + // FIXME-PERF: Eliminate this, it is very slow. + const MCSymbolData *getAtomForAddress(const MCAsmLayout &Layout, + const MCSectionData *Section, uint64_t Address) const; /// Find the symbol which defines the atom containing the given symbol, or /// null if there is no such symbol. // - // FIXME: Eliminate this, it is very slow. - const MCSymbolData *getAtom(const MCSymbolData *Symbol) const; + // FIXME-PERF: Eliminate this, it is very slow. + const MCSymbolData *getAtom(const MCAsmLayout &Layout, + const MCSymbolData *Symbol) const; /// Check whether a particular symbol is visible to the linker and is required /// in the symbol table, or whether it can be discarded by the assembler. This @@ -676,7 +695,8 @@ class MCAssembler { /// Emit the section contents using the given object writer. // // FIXME: Should MCAssembler always have a reference to the object writer? - void WriteSectionData(const MCSectionData *Section, MCObjectWriter *OW) const; + void WriteSectionData(const MCSectionData *Section, const MCAsmLayout &Layout, + MCObjectWriter *OW) const; public: /// Construct a new assembler instance. 
@@ -708,6 +728,9 @@ class MCAssembler { SubsectionsViaSymbols = Value; } + bool getRelaxAll() const { return RelaxAll; } + void setRelaxAll(bool Value) { RelaxAll = Value; } + /// @name Section List Access /// @{ diff --git a/include/llvm/MC/MCContext.h b/include/llvm/MC/MCContext.h index c5814b377eb8..968d55f214c7 100644 --- a/include/llvm/MC/MCContext.h +++ b/include/llvm/MC/MCContext.h @@ -65,19 +65,8 @@ namespace llvm { /// reference and return it. /// /// @param Name - The symbol name, which must be unique across all symbols. - MCSymbol *GetOrCreateSymbol(StringRef Name, bool isTemporary = false); - MCSymbol *GetOrCreateSymbol(const Twine &Name, bool isTemporary = false); - - /// GetOrCreateTemporarySymbol - Create a new assembler temporary symbol - /// with the specified @p Name if it doesn't exist or return the existing - /// one if it does. - /// - /// @param Name - The symbol name, for debugging purposes only, temporary - /// symbols do not surive assembly. - MCSymbol *GetOrCreateTemporarySymbol(StringRef Name) { - return GetOrCreateSymbol(Name, true); - } - MCSymbol *GetOrCreateTemporarySymbol(const Twine &Name); + MCSymbol *GetOrCreateSymbol(StringRef Name); + MCSymbol *GetOrCreateSymbol(const Twine &Name); /// LookupSymbol - Get the symbol for \p Name, or null. MCSymbol *LookupSymbol(StringRef Name) const; diff --git a/include/llvm/MC/MCExpr.h b/include/llvm/MC/MCExpr.h index 6efec52751c0..bd0684df67e9 100644 --- a/include/llvm/MC/MCExpr.h +++ b/include/llvm/MC/MCExpr.h @@ -160,11 +160,6 @@ class MCSymbolRefExpr : public MCExpr { static const MCSymbolRefExpr *Create(StringRef Name, VariantKind Kind, MCContext &Ctx); - /// CreateTemp - Create a reference to an assembler temporary label with the - /// specified name. 
- static const MCSymbolRefExpr *CreateTemp(StringRef Name, VariantKind Kind, - MCContext &Ctx); - /// @} /// @name Accessors /// @{ diff --git a/include/llvm/MC/MCInst.h b/include/llvm/MC/MCInst.h index 29b38dd15d11..dc630fe2807f 100644 --- a/include/llvm/MC/MCInst.h +++ b/include/llvm/MC/MCInst.h @@ -17,11 +17,13 @@ #define LLVM_MC_MCINST_H #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" #include "llvm/System/DataTypes.h" namespace llvm { class raw_ostream; class MCAsmInfo; +class MCInstPrinter; class MCExpr; /// MCOperand - Instances of this class represent operands of the MCInst class. @@ -125,6 +127,13 @@ class MCInst { void print(raw_ostream &OS, const MCAsmInfo *MAI) const; void dump() const; + + /// \brief Dump the MCInst as prettily as possible using the additional MC + /// structures, if given. Operators are separated by the \arg Separator + /// string. + void dump_pretty(raw_ostream &OS, const MCAsmInfo *MAI = 0, + const MCInstPrinter *Printer = 0, + StringRef Separator = " ") const; }; diff --git a/include/llvm/MC/MCObjectWriter.h b/include/llvm/MC/MCObjectWriter.h index d4fab0ea477f..f70a3d1851ea 100644 --- a/include/llvm/MC/MCObjectWriter.h +++ b/include/llvm/MC/MCObjectWriter.h @@ -16,8 +16,9 @@ namespace llvm { class MCAsmFixup; +class MCAsmLayout; class MCAssembler; -class MCDataFragment; +class MCFragment; class MCValue; class raw_ostream; @@ -69,7 +70,8 @@ class MCObjectWriter { /// information about the relocation so that it can be emitted during /// WriteObject(). virtual void RecordRelocation(const MCAssembler &Asm, - const MCDataFragment &Fragment, + const MCAsmLayout &Layout, + const MCFragment *Fragment, const MCAsmFixup &Fixup, MCValue Target, uint64_t &FixedValue) = 0; @@ -78,7 +80,8 @@ class MCObjectWriter { /// This routine is called by the assembler after layout and relaxation is /// complete, fixups have been evaluate and applied, and relocations /// generated. 
- virtual void WriteObject(const MCAssembler &Asm) = 0; + virtual void WriteObject(const MCAssembler &Asm, + const MCAsmLayout &Layout) = 0; /// @} /// @name Binary Output diff --git a/include/llvm/MC/MCSection.h b/include/llvm/MC/MCSection.h index 3d8815a7783f..4a1c46ccb926 100644 --- a/include/llvm/MC/MCSection.h +++ b/include/llvm/MC/MCSection.h @@ -42,9 +42,8 @@ namespace llvm { }; class MCSectionCOFF : public MCSection { - // FIXME: This memory is leaked because MCSectionCOFF is bump pointer - // allocated and this never gets freed. - std::string Name; + // The memory for this string is stored in the same MCContext as *this. + StringRef Name; /// IsDirective - This is true if the section name is a directive, not /// something that should be printed with ".section". @@ -61,7 +60,7 @@ namespace llvm { static MCSectionCOFF *Create(StringRef Name, bool IsDirective, SectionKind K, MCContext &Ctx); - const std::string &getName() const { return Name; } + StringRef getName() const { return Name; } bool isDirective() const { return IsDirective; } virtual void PrintSwitchToSection(const MCAsmInfo &MAI, diff --git a/include/llvm/MC/MCStreamer.h b/include/llvm/MC/MCStreamer.h index 4b088a57d748..bdcfdb2debee 100644 --- a/include/llvm/MC/MCStreamer.h +++ b/include/llvm/MC/MCStreamer.h @@ -88,7 +88,7 @@ class TargetAsmBackend; /// @name Symbol & Section Management /// @{ - /// getCurrentSection - Return the current seciton that the streamer is + /// getCurrentSection - Return the current section that the streamer is /// emitting code to. const MCSection *getCurrentSection() const { return CurSection; } @@ -308,7 +308,8 @@ class TargetAsmBackend; /// createMachOStream - Create a machine code streamer which will generative /// Mach-O format object files. 
MCStreamer *createMachOStreamer(MCContext &Ctx, TargetAsmBackend &TAB, - raw_ostream &OS, MCCodeEmitter *CE); + raw_ostream &OS, MCCodeEmitter *CE, + bool RelaxAll = false); } // end namespace llvm diff --git a/include/llvm/MC/MachObjectWriter.h b/include/llvm/MC/MachObjectWriter.h index 3e3305f083a2..844025d15c4a 100644 --- a/include/llvm/MC/MachObjectWriter.h +++ b/include/llvm/MC/MachObjectWriter.h @@ -17,7 +17,7 @@ namespace llvm { class MCAsmFixup; class MCAssembler; -class MCDataFragment; +class MCFragment; class MCValue; class raw_ostream; @@ -31,11 +31,12 @@ class MachObjectWriter : public MCObjectWriter { virtual void ExecutePostLayoutBinding(MCAssembler &Asm); virtual void RecordRelocation(const MCAssembler &Asm, - const MCDataFragment &Fragment, + const MCAsmLayout &Layout, + const MCFragment *Fragment, const MCAsmFixup &Fixup, MCValue Target, uint64_t &FixedValue); - virtual void WriteObject(const MCAssembler &Asm); + virtual void WriteObject(const MCAssembler &Asm, const MCAsmLayout &Layout); }; } // End llvm namespace diff --git a/include/llvm/PassManagers.h b/include/llvm/PassManagers.h index d5685c664c23..ed1e80eae69c 100644 --- a/include/llvm/PassManagers.h +++ b/include/llvm/PassManagers.h @@ -413,7 +413,6 @@ class PMDataManager { /// It batches all function passes and basic block pass managers together and /// sequence them to process one function at a time before processing next /// function. 
- class FPPassManager : public ModulePass, public PMDataManager { public: static char ID; @@ -462,8 +461,7 @@ class FPPassManager : public ModulePass, public PMDataManager { } }; -extern Timer *StartPassTimer(Pass *); -extern void StopPassTimer(Pass *, Timer *); +Timer *getPassTimer(Pass *); } diff --git a/include/llvm/Support/Allocator.h b/include/llvm/Support/Allocator.h index b1f59dc05e7b..bd381807e0c6 100644 --- a/include/llvm/Support/Allocator.h +++ b/include/llvm/Support/Allocator.h @@ -133,6 +133,7 @@ class BumpPtrAllocator { static MallocSlabAllocator DefaultSlabAllocator; + template friend class SpecificBumpPtrAllocator; public: BumpPtrAllocator(size_t size = 4096, size_t threshold = 4096, SlabAllocator &allocator = DefaultSlabAllocator); @@ -176,6 +177,45 @@ class BumpPtrAllocator { void PrintStats() const; }; +/// SpecificBumpPtrAllocator - Same as BumpPtrAllocator but allows only +/// elements of one type to be allocated. This allows calling the destructor +/// in DestroyAll() and when the allocator is destroyed. +template +class SpecificBumpPtrAllocator { + BumpPtrAllocator Allocator; +public: + SpecificBumpPtrAllocator(size_t size = 4096, size_t threshold = 4096, + SlabAllocator &allocator = BumpPtrAllocator::DefaultSlabAllocator) + : Allocator(size, threshold, allocator) {} + + ~SpecificBumpPtrAllocator() { + DestroyAll(); + } + + /// Call the destructor of each allocated object and deallocate all but the + /// current slab and reset the current pointer to the beginning of it, freeing + /// all memory allocated so far. + void DestroyAll() { + MemSlab *Slab = Allocator.CurSlab; + while (Slab) { + char *End = Slab == Allocator.CurSlab ? 
Allocator.CurPtr : + (char *)Slab + Slab->Size; + for (char *Ptr = (char*)Slab+1; Ptr < End; Ptr += sizeof(T)) { + Ptr = Allocator.AlignPtr(Ptr, alignof()); + if (Ptr + sizeof(T) <= End) + reinterpret_cast(Ptr)->~T(); + } + Slab = Slab->NextPtr; + } + Allocator.Reset(); + } + + /// Allocate space for a specific count of elements. + T *Allocate(size_t num = 1) { + return Allocator.Allocate(num); + } +}; + } // end namespace llvm inline void *operator new(size_t Size, llvm::BumpPtrAllocator &Allocator) { diff --git a/include/llvm/Support/CFG.h b/include/llvm/Support/CFG.h index 3875f0b4a0fd..57699c708890 100644 --- a/include/llvm/Support/CFG.h +++ b/include/llvm/Support/CFG.h @@ -67,15 +67,15 @@ class PredIterator : public std::iterator pred_iterator; typedef PredIterator pred_const_iterator; + Value::const_use_iterator> const_pred_iterator; inline pred_iterator pred_begin(BasicBlock *BB) { return pred_iterator(BB); } -inline pred_const_iterator pred_begin(const BasicBlock *BB) { - return pred_const_iterator(BB); +inline const_pred_iterator pred_begin(const BasicBlock *BB) { + return const_pred_iterator(BB); } inline pred_iterator pred_end(BasicBlock *BB) { return pred_iterator(BB, true);} -inline pred_const_iterator pred_end(const BasicBlock *BB) { - return pred_const_iterator(BB, true); +inline const_pred_iterator pred_end(const BasicBlock *BB) { + return const_pred_iterator(BB, true); } @@ -268,7 +268,7 @@ template <> struct GraphTraits > { template <> struct GraphTraits > { typedef const BasicBlock NodeType; - typedef pred_const_iterator ChildIteratorType; + typedef const_pred_iterator ChildIteratorType; static NodeType *getEntryNode(Inverse G) { return G.Graph; } diff --git a/include/llvm/Support/CallSite.h b/include/llvm/Support/CallSite.h index 285b558afccb..9f527e2df6cf 100644 --- a/include/llvm/Support/CallSite.h +++ b/include/llvm/Support/CallSite.h @@ -8,15 +8,18 @@ //===----------------------------------------------------------------------===// // // This 
file defines the CallSite class, which is a handy wrapper for code that -// wants to treat Call and Invoke instructions in a generic way. +// wants to treat Call and Invoke instructions in a generic way. When in non- +// mutation context (e.g. an analysis) ImmutableCallSite should be used. +// Finally, when some degree of customization is necessary between these two +// extremes, CallSiteBase<> can be supplied with fine-tuned parameters. // -// NOTE: This class is supposed to have "value semantics". So it should be -// passed by value, not by reference; it should not be "new"ed or "delete"d. It -// is efficiently copyable, assignable and constructable, with cost equivalent -// to copying a pointer (notice that it has only a single data member). -// The internal representation carries a flag which indicates which of the two -// variants is enclosed. This allows for cheaper checks when various accessors -// of CallSite are employed. +// NOTE: These classes are supposed to have "value semantics". So they should be +// passed by value, not by reference; they should not be "new"ed or "delete"d. +// They are efficiently copyable, assignable and constructable, with cost +// equivalent to copying a pointer (notice that they have only a single data +// member). The internal representation carries a flag which indicates which of +// the two variants is enclosed. This allows for cheaper checks when various +// accessors of CallSite are employed. 
// //===----------------------------------------------------------------------===// @@ -34,13 +37,166 @@ namespace llvm { class CallInst; class InvokeInst; -class CallSite { - PointerIntPair I; +template +class CallSiteBase { +protected: + PointerIntPair I; public: - CallSite() : I(0, false) {} - CallSite(CallInst *CI) : I(reinterpret_cast(CI), true) {} - CallSite(InvokeInst *II) : I(reinterpret_cast(II), false) {} - CallSite(Instruction *C); + CallSiteBase() : I(0, false) {} + CallSiteBase(CallTy *CI) : I(reinterpret_cast(CI), true) {} + CallSiteBase(InvokeTy *II) : I(reinterpret_cast(II), false) {} + CallSiteBase(ValTy *II) { *this = get(II); } + CallSiteBase(InstrTy *II) { + assert(II && "Null instruction given?"); + *this = get(II); + assert(I.getPointer()); + } + + /// CallSiteBase::get - This static method is sort of like a constructor. It + /// will create an appropriate call site for a Call or Invoke instruction, but + /// it can also create a null initialized CallSiteBase object for something + /// which is NOT a call site. + /// + static CallSiteBase get(ValTy *V) { + if (InstrTy *II = dyn_cast(V)) { + if (II->getOpcode() == Instruction::Call) + return CallSiteBase(reinterpret_cast(II)); + else if (II->getOpcode() == Instruction::Invoke) + return CallSiteBase(reinterpret_cast(II)); + } + return CallSiteBase(); + } + + /// isCall - true if a CallInst is enclosed. + /// Note that !isCall() does not mean it is an InvokeInst enclosed, + /// it also could signify a NULL Instruction pointer. + bool isCall() const { return I.getInt(); } + + /// isInvoke - true if a InvokeInst is enclosed. + /// + bool isInvoke() const { return getInstruction() && !I.getInt(); } + + InstrTy *getInstruction() const { return I.getPointer(); } + InstrTy *operator->() const { return I.getPointer(); } + operator bool() const { return I.getPointer(); } + + /// getCalledValue - Return the pointer to function that is being called... 
+ /// + ValTy *getCalledValue() const { + assert(getInstruction() && "Not a call or invoke instruction!"); + return *getCallee(); + } + + /// getCalledFunction - Return the function being called if this is a direct + /// call, otherwise return null (if it's an indirect call). + /// + FunTy *getCalledFunction() const { + return dyn_cast(getCalledValue()); + } + + /// setCalledFunction - Set the callee to the specified value... + /// + void setCalledFunction(Value *V) { + assert(getInstruction() && "Not a call or invoke instruction!"); + *getCallee() = V; + } + + /// isCallee - Determine whether the passed iterator points to the + /// callee operand's Use. + /// + bool isCallee(value_use_iterator UI) const { + return getCallee() == &UI.getUse(); + } + + ValTy *getArgument(unsigned ArgNo) const { + assert(arg_begin() + ArgNo < arg_end() && "Argument # out of range!"); + return *(arg_begin()+ArgNo); + } + + void setArgument(unsigned ArgNo, Value* newVal) { + assert(getInstruction() && "Not a call or invoke instruction!"); + assert(arg_begin() + ArgNo < arg_end() && "Argument # out of range!"); + getInstruction()->setOperand(getArgumentOffset() + ArgNo, newVal); + } + + /// Given a value use iterator, returns the argument that corresponds to it. + /// Iterator must actually correspond to an argument. + unsigned getArgumentNo(value_use_iterator I) const { + assert(getInstruction() && "Not a call or invoke instruction!"); + assert(arg_begin() <= &I.getUse() && &I.getUse() < arg_end() + && "Argument # out of range!"); + return &I.getUse() - arg_begin(); + } + + /// arg_iterator - The type of iterator to use when looping over actual + /// arguments at this call site... + typedef IterTy arg_iterator; + + /// arg_begin/arg_end - Return iterators corresponding to the actual argument + /// list for a call site. 
+ IterTy arg_begin() const { + assert(getInstruction() && "Not a call or invoke instruction!"); + // Skip non-arguments + return (*this)->op_begin() + getArgumentOffset(); + } + + IterTy arg_end() const { return (*this)->op_end() - getArgumentEndOffset(); } + bool arg_empty() const { return arg_end() == arg_begin(); } + unsigned arg_size() const { return unsigned(arg_end() - arg_begin()); } + +private: + /// Returns the operand number of the first argument + unsigned getArgumentOffset() const { + if (isCall()) + return 1; // Skip Function (ATM) + else + return 0; // Args are at the front + } + + unsigned getArgumentEndOffset() const { + if (isCall()) + return 0; // Unchanged (ATM) + else + return 3; // Skip BB, BB, Function + } + + IterTy getCallee() const { + // FIXME: this is slow, since we do not have the fast versions + // of the op_*() functions here. See CallSite::getCallee. + // + if (isCall()) + return getInstruction()->op_begin(); // Unchanged (ATM) + else + return getInstruction()->op_end() - 3; // Skip BB, BB, Function + } +}; + +/// ImmutableCallSite - establish a view to a call site for examination +class ImmutableCallSite : public CallSiteBase<> { + typedef CallSiteBase<> _Base; +public: + ImmutableCallSite(const Value* V) : _Base(V) {} + ImmutableCallSite(const CallInst *CI) : _Base(CI) {} + ImmutableCallSite(const InvokeInst *II) : _Base(II) {} + ImmutableCallSite(const Instruction *II) : _Base(II) {} +}; + +class CallSite : public CallSiteBase { + typedef CallSiteBase _Base; +public: + CallSite() {} + CallSite(_Base B) : _Base(B) {} + CallSite(CallInst *CI) : _Base(CI) {} + CallSite(InvokeInst *II) : _Base(II) {} + CallSite(Instruction *II) : _Base(II) {} bool operator==(const CallSite &CS) const { return I == CS.I; } bool operator!=(const CallSite &CS) const { return I != CS.I; } @@ -51,13 +207,7 @@ class CallSite { /// NOT a call site. 
/// static CallSite get(Value *V) { - if (Instruction *I = dyn_cast(V)) { - if (I->getOpcode() == Instruction::Call) - return CallSite(reinterpret_cast(I)); - else if (I->getOpcode() == Instruction::Invoke) - return CallSite(reinterpret_cast(I)); - } - return CallSite(); + return _Base::get(V); } /// getCallingConv/setCallingConv - get or set the calling convention of the @@ -76,6 +226,10 @@ class CallSite { /// @brief Extract the alignment for a call or parameter (0=unknown). uint16_t getParamAlignment(uint16_t i) const; + /// @brief Return true if the call should not be inlined. + bool isNoInline() const; + void setIsNoInline(bool Value = true); + /// @brief Determine if the call does not access memory. bool doesNotAccessMemory() const; void setDoesNotAccessMemory(bool doesNotAccessMemory = true); @@ -94,103 +248,22 @@ class CallSite { /// getType - Return the type of the instruction that generated this call site /// - const Type *getType() const { return getInstruction()->getType(); } - - /// isCall - true if a CallInst is enclosed. - /// Note that !isCall() does not mean it is an InvokeInst enclosed, - /// it also could signify a NULL Instruction pointer. - bool isCall() const { return I.getInt(); } - - /// isInvoke - true if a InvokeInst is enclosed. - /// - bool isInvoke() const { return getInstruction() && !I.getInt(); } - - /// getInstruction - Return the instruction this call site corresponds to - /// - Instruction *getInstruction() const { return I.getPointer(); } + const Type *getType() const { return (*this)->getType(); } /// getCaller - Return the caller function for this call site /// - Function *getCaller() const { return getInstruction() - ->getParent()->getParent(); } - - /// getCalledValue - Return the pointer to function that is being called... 
- /// - Value *getCalledValue() const { - assert(getInstruction() && "Not a call or invoke instruction!"); - return getInstruction()->getOperand(0); - } - - /// getCalledFunction - Return the function being called if this is a direct - /// call, otherwise return null (if it's an indirect call). - /// - Function *getCalledFunction() const { - return dyn_cast(getCalledValue()); - } - - /// setCalledFunction - Set the callee to the specified value... - /// - void setCalledFunction(Value *V) { - assert(getInstruction() && "Not a call or invoke instruction!"); - getInstruction()->setOperand(0, V); - } - - Value *getArgument(unsigned ArgNo) const { - assert(arg_begin() + ArgNo < arg_end() && "Argument # out of range!"); - return *(arg_begin()+ArgNo); - } - - void setArgument(unsigned ArgNo, Value* newVal) { - assert(getInstruction() && "Not a call or invoke instruction!"); - assert(arg_begin() + ArgNo < arg_end() && "Argument # out of range!"); - getInstruction()->setOperand(getArgumentOffset() + ArgNo, newVal); - } - - /// Given an operand number, returns the argument that corresponds to it. - /// OperandNo must be a valid operand number that actually corresponds to an - /// argument. - unsigned getArgumentNo(unsigned OperandNo) const { - assert(OperandNo >= getArgumentOffset() && "Operand number passed was not " - "a valid argument"); - return OperandNo - getArgumentOffset(); - } + Function *getCaller() const { return (*this)->getParent()->getParent(); } /// hasArgument - Returns true if this CallSite passes the given Value* as an /// argument to the called function. bool hasArgument(const Value *Arg) const; - /// arg_iterator - The type of iterator to use when looping over actual - /// arguments at this call site... - typedef User::op_iterator arg_iterator; - - /// arg_begin/arg_end - Return iterators corresponding to the actual argument - /// list for a call site. 
- arg_iterator arg_begin() const { - assert(getInstruction() && "Not a call or invoke instruction!"); - // Skip non-arguments - return getInstruction()->op_begin() + getArgumentOffset(); - } - - arg_iterator arg_end() const { return getInstruction()->op_end(); } - bool arg_empty() const { return arg_end() == arg_begin(); } - unsigned arg_size() const { return unsigned(arg_end() - arg_begin()); } - bool operator<(const CallSite &CS) const { return getInstruction() < CS.getInstruction(); } - bool isCallee(Value::use_iterator UI) const { - return getInstruction()->op_begin() == &UI.getUse(); - } - private: - /// Returns the operand number of the first argument - unsigned getArgumentOffset() const { - if (isCall()) - return 1; // Skip Function - else - return 3; // Skip Function, BB, BB - } + User::op_iterator getCallee() const; }; } // End llvm namespace diff --git a/include/llvm/Support/Casting.h b/include/llvm/Support/Casting.h index 17bcb5990599..dccbfadfa305 100644 --- a/include/llvm/Support/Casting.h +++ b/include/llvm/Support/Casting.h @@ -50,9 +50,11 @@ template struct simplify_type { // if (isa(myVal)) { ... } // template -inline bool isa_impl(const From &Val) { - return To::classof(&Val); -} +struct isa_impl { + static inline bool doit(const From &Val) { + return To::classof(&Val); + } +}; template struct isa_impl_wrap { @@ -68,7 +70,7 @@ template struct isa_impl_wrap { // When From == SimpleType, we are as simple as we are going to get. 
static bool doit(const FromTy &Val) { - return isa_impl(Val); + return isa_impl::doit(Val); } }; @@ -251,10 +253,12 @@ struct foo { }*/ }; -template <> inline bool isa_impl(const bar &Val) { - dbgs() << "Classof: " << &Val << "\n"; - return true; -} +template <> struct isa_impl { + static inline bool doit(const bar &Val) { + dbgs() << "Classof: " << &Val << "\n"; + return true; + } +}; bar *fub(); diff --git a/include/llvm/Support/DebugLoc.h b/include/llvm/Support/DebugLoc.h index 32631fcb34e5..ede1ed305450 100644 --- a/include/llvm/Support/DebugLoc.h +++ b/include/llvm/Support/DebugLoc.h @@ -1,4 +1,4 @@ -//===---- llvm/DebugLoc.h - Debug Location Information ----------*- C++ -*-===// +//===---- llvm/Support/DebugLoc.h - Debug Location Information --*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -20,12 +20,71 @@ namespace llvm { class MDNode; + class LLVMContext; + + /// DebugLoc - Debug location id. This is carried by Instruction, SDNode, + /// and MachineInstr to compactly encode file/line/scope information for an + /// operation. + class NewDebugLoc { + /// LineCol - This 32-bit value encodes the line and column number for the + /// location, encoded as 24-bits for line and 8 bits for col. A value of 0 + /// for either means unknown. + unsigned LineCol; + + /// ScopeIdx - This is an opaque ID# for Scope/InlinedAt information, + /// decoded by LLVMContext. 0 is unknown. + int ScopeIdx; + public: + NewDebugLoc() : LineCol(0), ScopeIdx(0) {} // Defaults to unknown. + + /// get - Get a new DebugLoc that corresponds to the specified line/col + /// scope/inline location. + static NewDebugLoc get(unsigned Line, unsigned Col, + MDNode *Scope, MDNode *InlinedAt = 0); + + /// getFromDILocation - Translate the DILocation quad into a NewDebugLoc. + static NewDebugLoc getFromDILocation(MDNode *N); + + /// isUnknown - Return true if this is an unknown location. 
+ bool isUnknown() const { return ScopeIdx == 0; } + + unsigned getLine() const { + return (LineCol << 8) >> 8; // Mask out column. + } + + unsigned getCol() const { + return LineCol >> 24; + } + + /// getScope - This returns the scope pointer for this DebugLoc, or null if + /// invalid. + MDNode *getScope(const LLVMContext &Ctx) const; + + /// getInlinedAt - This returns the InlinedAt pointer for this DebugLoc, or + /// null if invalid or not present. + MDNode *getInlinedAt(const LLVMContext &Ctx) const; + + /// getScopeAndInlinedAt - Return both the Scope and the InlinedAt values. + void getScopeAndInlinedAt(MDNode *&Scope, MDNode *&IA, + const LLVMContext &Ctx) const; + + + /// getAsMDNode - This method converts the compressed DebugLoc node into a + /// DILocation compatible MDNode. + MDNode *getAsMDNode(const LLVMContext &Ctx) const; + + bool operator==(const NewDebugLoc &DL) const { + return LineCol == DL.LineCol && ScopeIdx == DL.ScopeIdx; + } + bool operator!=(const NewDebugLoc &DL) const { return !(*this == DL); } + }; + + /// DebugLoc - Debug location id. This is carried by SDNode and MachineInstr /// to index into a vector of unique debug location tuples. class DebugLoc { unsigned Idx; - public: DebugLoc() : Idx(~0U) {} // Defaults to invalid. @@ -42,7 +101,7 @@ namespace llvm { bool operator!=(const DebugLoc &DL) const { return !(*this == DL); } }; - /// DebugLocTracker - This class tracks debug location information. + /// DebugLocTracker - This class tracks debug location information. /// struct DebugLocTracker { /// DebugLocations - A vector of unique DebugLocTuple's. 
diff --git a/include/llvm/Support/FileUtilities.h b/include/llvm/Support/FileUtilities.h index cc8f95372b11..d0dd4a759888 100644 --- a/include/llvm/Support/FileUtilities.h +++ b/include/llvm/Support/FileUtilities.h @@ -40,6 +40,8 @@ namespace llvm { sys::Path Filename; bool DeleteIt; public: + FileRemover() : DeleteIt(false) {} + explicit FileRemover(const sys::Path &filename, bool deleteIt = true) : Filename(filename), DeleteIt(deleteIt) {} @@ -50,6 +52,17 @@ namespace llvm { } } + /// setFile - Give ownership of the file to the FileRemover so it will + /// be removed when the object is destroyed. If the FileRemover already + /// had ownership of a file, remove it first. + void setFile(const sys::Path &filename, bool deleteIt = true) { + if (DeleteIt) + Filename.eraseFromDisk(); + + Filename = filename; + DeleteIt = deleteIt; + } + /// releaseFile - Take ownership of the file away from the FileRemover so it /// will not be removed when the object is destroyed. void releaseFile() { DeleteIt = false; } diff --git a/include/llvm/Support/IRBuilder.h b/include/llvm/Support/IRBuilder.h index 1f4e598990d6..c352625aeb4d 100644 --- a/include/llvm/Support/IRBuilder.h +++ b/include/llvm/Support/IRBuilder.h @@ -40,8 +40,7 @@ class IRBuilderDefaultInserter { /// IRBuilderBase - Common base class shared among various IRBuilders. 
class IRBuilderBase { - unsigned DbgMDKind; - MDNode *CurDbgLocation; + NewDebugLoc CurDbgLocation; protected: BasicBlock *BB; BasicBlock::iterator InsertPt; @@ -49,7 +48,7 @@ class IRBuilderBase { public: IRBuilderBase(LLVMContext &context) - : DbgMDKind(0), CurDbgLocation(0), Context(context) { + : Context(context) { ClearInsertionPoint(); } @@ -65,6 +64,7 @@ class IRBuilderBase { BasicBlock *GetInsertBlock() const { return BB; } BasicBlock::iterator GetInsertPoint() const { return InsertPt; } + LLVMContext &getContext() const { return Context; } /// SetInsertPoint - This specifies that created instructions should be /// appended to the end of the specified block. @@ -82,12 +82,20 @@ class IRBuilderBase { /// SetCurrentDebugLocation - Set location information used by debugging /// information. - void SetCurrentDebugLocation(MDNode *L); - MDNode *getCurrentDebugLocation() const { return CurDbgLocation; } + void SetCurrentDebugLocation(const NewDebugLoc &L) { + CurDbgLocation = L; + } + + /// getCurrentDebugLocation - Get location information used by debugging + /// information. + const NewDebugLoc &getCurrentDebugLocation() const { return CurDbgLocation; } /// SetInstDebugLocation - If this builder has a current debug location, set /// it on the specified instruction. - void SetInstDebugLocation(Instruction *I) const; + void SetInstDebugLocation(Instruction *I) const { + if (!CurDbgLocation.isUnknown()) + I->setDebugLoc(CurDbgLocation); + } //===--------------------------------------------------------------------===// // Miscellaneous creation methods. 
@@ -208,7 +216,7 @@ class IRBuilder : public IRBuilderBase, public Inserter { template InstTy *Insert(InstTy *I, const Twine &Name = "") const { this->InsertHelper(I, Name, BB, InsertPt); - if (getCurrentDebugLocation() != 0) + if (!getCurrentDebugLocation().isUnknown()) this->SetInstDebugLocation(I); return I; } diff --git a/include/llvm/Support/MathExtras.h b/include/llvm/Support/MathExtras.h index fa12416aeac7..9c5f32cd5ff2 100644 --- a/include/llvm/Support/MathExtras.h +++ b/include/llvm/Support/MathExtras.h @@ -32,35 +32,43 @@ inline uint32_t Lo_32(uint64_t Value) { return static_cast(Value); } -/// is?Type - these functions produce optimal testing for integer data types. -inline bool isInt8 (int64_t Value) { - return static_cast(Value) == Value; -} -inline bool isUInt8 (int64_t Value) { - return static_cast(Value) == Value; -} -inline bool isInt16 (int64_t Value) { - return static_cast(Value) == Value; -} -inline bool isUInt16(int64_t Value) { - return static_cast(Value) == Value; -} -inline bool isInt32 (int64_t Value) { - return static_cast(Value) == Value; -} -inline bool isUInt32(int64_t Value) { - return static_cast(Value) == Value; -} - +/// isInt - Checks if an integer fits into the given bit width. template inline bool isInt(int64_t x) { return N >= 64 || (-(INT64_C(1)<<(N-1)) <= x && x < (INT64_C(1)<<(N-1))); } +// Template specializations to get better code for common cases. +template<> +inline bool isInt<8>(int64_t x) { + return static_cast(x) == x; +} +template<> +inline bool isInt<16>(int64_t x) { + return static_cast(x) == x; +} +template<> +inline bool isInt<32>(int64_t x) { + return static_cast(x) == x; +} +/// isUInt - Checks if an unsigned integer fits into the given bit width. 
template -inline bool isUint(uint64_t x) { +inline bool isUInt(uint64_t x) { return N >= 64 || x < (UINT64_C(1)< +inline bool isUInt<8>(uint64_t x) { + return static_cast(x) == x; +} +template<> +inline bool isUInt<16>(uint64_t x) { + return static_cast(x) == x; +} +template<> +inline bool isUInt<32>(uint64_t x) { + return static_cast(x) == x; +} /// isMask_32 - This function returns true if the argument is a sequence of ones /// starting at the least significant bit with the remainder zero (32 bit diff --git a/include/llvm/Support/Timer.h b/include/llvm/Support/Timer.h index 8a0f55d9b93a..00dfeaa4fadf 100644 --- a/include/llvm/Support/Timer.h +++ b/include/llvm/Support/Timer.h @@ -16,16 +16,63 @@ #define LLVM_SUPPORT_TIMER_H #include "llvm/System/DataTypes.h" -#include "llvm/System/Mutex.h" +#include "llvm/ADT/StringRef.h" +#include #include #include -#include +#include namespace llvm { +class Timer; class TimerGroup; class raw_ostream; +class TimeRecord { + double WallTime; // Wall clock time elapsed in seconds + double UserTime; // User time elapsed + double SystemTime; // System time elapsed + ssize_t MemUsed; // Memory allocated (in bytes) +public: + TimeRecord() : WallTime(0), UserTime(0), SystemTime(0), MemUsed(0) {} + + /// getCurrentTime - Get the current time and memory usage. If Start is true + /// we get the memory usage before the time, otherwise we get time before + /// memory usage. This matters if the time to get the memory usage is + /// significant and shouldn't be counted as part of a duration. + static TimeRecord getCurrentTime(bool Start = true); + + double getProcessTime() const { return UserTime+SystemTime; } + double getUserTime() const { return UserTime; } + double getSystemTime() const { return SystemTime; } + double getWallTime() const { return WallTime; } + ssize_t getMemUsed() const { return MemUsed; } + + + // operator< - Allow sorting. 
+ bool operator<(const TimeRecord &T) const { + // Sort by Wall Time elapsed, as it is the only thing really accurate + return WallTime < T.WallTime; + } + + void operator+=(const TimeRecord &RHS) { + WallTime += RHS.WallTime; + UserTime += RHS.UserTime; + SystemTime += RHS.SystemTime; + MemUsed += RHS.MemUsed; + } + void operator-=(const TimeRecord &RHS) { + WallTime -= RHS.WallTime; + UserTime -= RHS.UserTime; + SystemTime -= RHS.SystemTime; + MemUsed -= RHS.MemUsed; + } + + /// print - Print the current timer to standard error, and reset the "Started" + /// flag. + void print(const TimeRecord &Total, raw_ostream &OS) const; +}; + /// Timer - This class is used to track the amount of time spent between /// invocations of its startTimer()/stopTimer() methods. Given appropriate OS /// support it can also keep track of the RSS of the program at various points. @@ -35,65 +82,32 @@ class raw_ostream; /// if they are never started. /// class Timer { - double Elapsed; // Wall clock time elapsed in seconds - double UserTime; // User time elapsed - double SystemTime; // System time elapsed - ssize_t MemUsed; // Memory allocated (in bytes) - size_t PeakMem; // Peak memory used - size_t PeakMemBase; // Temporary for peak calculation... - std::string Name; // The name of this time variable + TimeRecord Time; + std::string Name; // The name of this time variable. bool Started; // Has this time variable ever been started? TimerGroup *TG; // The TimerGroup this Timer is in. - mutable sys::SmartMutex Lock; // Mutex for the contents of this Timer. + + Timer **Prev, *Next; // Doubly linked list of timers in the group. 
public: - explicit Timer(const std::string &N); - Timer(const std::string &N, TimerGroup &tg); - Timer(const Timer &T); - ~Timer(); - - double getProcessTime() const { return UserTime+SystemTime; } - double getWallTime() const { return Elapsed; } - ssize_t getMemUsed() const { return MemUsed; } - size_t getPeakMem() const { return PeakMem; } - std::string getName() const { return Name; } - + explicit Timer(StringRef N) : TG(0) { init(N); } + Timer(StringRef N, TimerGroup &tg) : TG(0) { init(N, tg); } + Timer(const Timer &RHS) : TG(0) { + assert(RHS.TG == 0 && "Can only copy uninitialized timers"); + } const Timer &operator=(const Timer &T) { - if (&T < this) { - T.Lock.acquire(); - Lock.acquire(); - } else { - Lock.acquire(); - T.Lock.acquire(); - } - - Elapsed = T.Elapsed; - UserTime = T.UserTime; - SystemTime = T.SystemTime; - MemUsed = T.MemUsed; - PeakMem = T.PeakMem; - PeakMemBase = T.PeakMemBase; - Name = T.Name; - Started = T.Started; - assert(TG == T.TG && "Can only assign timers in the same TimerGroup!"); - - if (&T < this) { - T.Lock.release(); - Lock.release(); - } else { - Lock.release(); - T.Lock.release(); - } - + assert(TG == 0 && T.TG == 0 && "Can only assign uninit timers"); return *this; } + ~Timer(); - // operator< - Allow sorting... - bool operator<(const Timer &T) const { - // Sort by Wall Time elapsed, as it is the only thing really accurate - return Elapsed < T.Elapsed; - } - bool operator>(const Timer &T) const { return T.operator<(*this); } - + // Create an uninitialized timer, client must use 'init'. + explicit Timer() : TG(0) {} + void init(StringRef N); + void init(StringRef N, TimerGroup &tg); + + const std::string &getName() const { return Name; } + bool isInitialized() const { return TG != 0; } + /// startTimer - Start the timer running. Time between calls to /// startTimer/stopTimer is counted by the Timer class. Note that these calls /// must be correctly paired. 
@@ -104,25 +118,8 @@ class Timer { /// void stopTimer(); - /// addPeakMemoryMeasurement - This method should be called whenever memory - /// usage needs to be checked. It adds a peak memory measurement to the - /// currently active timers, which will be printed when the timer group prints - /// - static void addPeakMemoryMeasurement(); - - /// print - Print the current timer to standard error, and reset the "Started" - /// flag. - void print(const Timer &Total, raw_ostream &OS); - private: friend class TimerGroup; - - // Copy ctor, initialize with no TG member. - Timer(bool, const Timer &T); - - /// sum - Add the time accumulated in the specified timer into this timer. - /// - void sum(const Timer &T); }; @@ -139,12 +136,10 @@ class TimeRegion { T->startTimer(); } explicit TimeRegion(Timer *t) : T(t) { - if (T) - T->startTimer(); + if (T) T->startTimer(); } ~TimeRegion() { - if (T) - T->stopTimer(); + if (T) T->stopTimer(); } }; @@ -155,9 +150,8 @@ class TimeRegion { /// is primarily used for debugging and for hunting performance problems. /// struct NamedRegionTimer : public TimeRegion { - explicit NamedRegionTimer(const std::string &Name); - explicit NamedRegionTimer(const std::string &Name, - const std::string &GroupName); + explicit NamedRegionTimer(StringRef Name); + explicit NamedRegionTimer(StringRef Name, StringRef GroupName); }; @@ -168,20 +162,29 @@ struct NamedRegionTimer : public TimeRegion { /// class TimerGroup { std::string Name; - unsigned NumTimers; - std::vector TimersToPrint; + Timer *FirstTimer; // First timer in the group. + std::vector > TimersToPrint; + + TimerGroup **Prev, *Next; // Doubly linked list of TimerGroup's. 
+ TimerGroup(const TimerGroup &TG); // DO NOT IMPLEMENT + void operator=(const TimerGroup &TG); // DO NOT IMPLEMENT public: - explicit TimerGroup(const std::string &name) : Name(name), NumTimers(0) {} - ~TimerGroup() { - assert(NumTimers == 0 && - "TimerGroup destroyed before all contained timers!"); - } + explicit TimerGroup(StringRef name); + ~TimerGroup(); + void setName(StringRef name) { Name.assign(name.begin(), name.end()); } + + /// print - Print any started timers in this group and zero them. + void print(raw_ostream &OS); + + /// printAll - This static method prints all timers and clears them all out. + static void printAll(raw_ostream &OS); + private: friend class Timer; - void addTimer(); - void removeTimer(); - void addTimerToPrint(const Timer &T); + void addTimer(Timer &T); + void removeTimer(Timer &T); + void PrintQueuedTimers(raw_ostream &OS); }; } // End llvm namespace diff --git a/include/llvm/Support/ValueHandle.h b/include/llvm/Support/ValueHandle.h index 82c3caeb3ad6..130a620ab2fa 100644 --- a/include/llvm/Support/ValueHandle.h +++ b/include/llvm/Support/ValueHandle.h @@ -284,8 +284,7 @@ class TrackingVH : public ValueHandleBase { Value *VP = ValueHandleBase::getValPtr(); // Null is always ok. - if (!VP) - return; + if (!VP) return; // Check that this value is valid (i.e., it hasn't been deleted). 
We // explicitly delay this check until access to avoid requiring clients to be @@ -302,7 +301,7 @@ class TrackingVH : public ValueHandleBase { ValueTy *getValPtr() const { CheckValidity(); - return static_cast(ValueHandleBase::getValPtr()); + return (ValueTy*)ValueHandleBase::getValPtr(); } void setValPtr(ValueTy *P) { CheckValidity(); diff --git a/include/llvm/Support/raw_ostream.h b/include/llvm/Support/raw_ostream.h index 0f227ccc3f80..e0de80f1d435 100644 --- a/include/llvm/Support/raw_ostream.h +++ b/include/llvm/Support/raw_ostream.h @@ -89,6 +89,7 @@ class raw_ostream { /// has_error - Return the value of the flag in this raw_ostream indicating /// whether an output error has been encountered. + /// This doesn't implicitly flush any pending output. bool has_error() const { return Error; } diff --git a/include/llvm/System/Memory.h b/include/llvm/System/Memory.h index 69251dd2bfbd..01bcab1f0070 100644 --- a/include/llvm/System/Memory.h +++ b/include/llvm/System/Memory.h @@ -27,7 +27,7 @@ namespace sys { /// @brief Memory block abstraction. class MemoryBlock { public: - MemoryBlock() { } + MemoryBlock() : Address(0), Size(0) { } MemoryBlock(void *addr, size_t size) : Address(addr), Size(size) { } void *base() const { return Address; } size_t size() const { return Size; } diff --git a/include/llvm/Target/TargetAsmBackend.h b/include/llvm/Target/TargetAsmBackend.h index bb501cc5226b..f350ecc410ac 100644 --- a/include/llvm/Target/TargetAsmBackend.h +++ b/include/llvm/Target/TargetAsmBackend.h @@ -15,8 +15,12 @@ namespace llvm { class MCAsmFixup; class MCDataFragment; +class MCInst; +class MCInstFragment; class MCObjectWriter; class MCSection; +template +class SmallVectorImpl; class Target; class raw_ostream; @@ -95,6 +99,27 @@ class TargetAsmBackend { /// fixup kind as appropriate. 
virtual void ApplyFixup(const MCAsmFixup &Fixup, MCDataFragment &Fragment, uint64_t Value) const = 0; + + /// MayNeedRelaxation - Check whether the given instruction may need + /// relaxation. + /// + /// \arg Inst - The instruction to test. + /// \arg Fixups - The actual fixups this instruction encoded to, for potential + /// use by the target backend. + virtual bool MayNeedRelaxation(const MCInst &Inst, + const SmallVectorImpl &Fixups) const = 0; + + /// RelaxInstruction - Relax the instruction in the given fragment to the next + /// wider instruction. + virtual void RelaxInstruction(const MCInstFragment *IF, + MCInst &Res) const = 0; + + /// WriteNopData - Write an (optimal) nop sequence of Count bytes to the given + /// output. If the target cannot generate such a sequence, it should return an + /// error. + /// + /// \return - True on success. + virtual bool WriteNopData(uint64_t Count, MCObjectWriter *OW) const = 0; }; } // End llvm namespace diff --git a/include/llvm/Target/TargetInstrDesc.h b/include/llvm/Target/TargetInstrDesc.h index 9efb6833fb86..adc37e16e45f 100644 --- a/include/llvm/Target/TargetInstrDesc.h +++ b/include/llvm/Target/TargetInstrDesc.h @@ -204,6 +204,16 @@ class TargetInstrDesc { return ImplicitUses; } + /// getNumImplicitUses - Return the number of implicit uses this instruction + /// has. + unsigned getNumImplicitUses() const { + if (ImplicitUses == 0) return 0; + unsigned i = 0; + for (; ImplicitUses[i]; ++i) /*empty*/; + return i; + } + + /// getImplicitDefs - Return a list of registers that are potentially /// written by any instance of this machine instruction. For example, on X86, /// many instructions implicitly set the flags register. In this case, they @@ -218,6 +228,15 @@ class TargetInstrDesc { return ImplicitDefs; } + /// getNumImplicitDefs - Return the number of implicit defs this instruction + /// has. 
+ unsigned getNumImplicitDefs() const { + if (ImplicitDefs == 0) return 0; + unsigned i = 0; + for (; ImplicitDefs[i]; ++i) /*empty*/; + return i; + } + /// hasImplicitUseOfPhysReg - Return true if this instruction implicitly /// uses the specified physical register. bool hasImplicitUseOfPhysReg(unsigned Reg) const { diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h index da0f68606c6f..b0534ddaa5e5 100644 --- a/include/llvm/Target/TargetLowering.h +++ b/include/llvm/Target/TargetLowering.h @@ -469,29 +469,6 @@ class TargetLowering { getIndexedStoreAction(IdxMode, VT) == Custom); } - /// getConvertAction - Return how the conversion should be treated: - /// either it is legal, needs to be promoted to a larger size, needs to be - /// expanded to some other code sequence, or the target has a custom expander - /// for it. - LegalizeAction - getConvertAction(EVT FromVT, EVT ToVT) const { - assert((unsigned)FromVT.getSimpleVT().SimpleTy < - array_lengthof(ConvertActions) && - (unsigned)ToVT.getSimpleVT().SimpleTy < - sizeof(ConvertActions[0])*4 && - "Table isn't big enough!"); - return (LegalizeAction)((ConvertActions[FromVT.getSimpleVT().SimpleTy] >> - (2*ToVT.getSimpleVT().SimpleTy)) & 3); - } - - /// isConvertLegal - Return true if the specified conversion is legal - /// on this target. - bool isConvertLegal(EVT FromVT, EVT ToVT) const { - return isTypeLegal(FromVT) && isTypeLegal(ToVT) && - (getConvertAction(FromVT, ToVT) == Legal || - getConvertAction(FromVT, ToVT) == Custom); - } - /// getCondCodeAction - Return how the condition code should be treated: /// either it is legal, needs to be expanded to some other code sequence, /// or the target has a custom expander for it. @@ -545,7 +522,7 @@ class TargetLowering { /// counterpart (e.g. structs), otherwise it will assert. EVT getValueType(const Type *Ty, bool AllowUnknown = false) const { EVT VT = EVT::getEVT(Ty, AllowUnknown); - return VT == MVT:: iPTR ? 
PointerTy : VT; + return VT == MVT::iPTR ? PointerTy : VT; } /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate @@ -657,11 +634,14 @@ class TargetLowering { /// getOptimalMemOpType - Returns the target specific optimal type for load /// and store operations as a result of memset, memcpy, and memmove lowering. - /// It returns EVT::Other if SelectionDAG should be responsible for - /// determining it. - virtual EVT getOptimalMemOpType(uint64_t Size, unsigned Align, - bool isSrcConst, bool isSrcStr, - SelectionDAG &DAG) const { + /// If DstAlign is zero that means it's safe to destination alignment can + /// satisfy any constraint. Similarly if SrcAlign is zero it means there isn't + /// a need to check it against alignment requirement, probably because the + /// source does not need to be loaded. It returns EVT::Other if SelectionDAG + /// should be responsible for determining it. + virtual EVT getOptimalMemOpType(uint64_t Size, + unsigned DstAlign, unsigned SrcAlign, + bool SafeToUseFP, SelectionDAG &DAG) const { return MVT::Other; } @@ -990,7 +970,7 @@ class TargetLowering { } /// setLoadExtAction - Indicate that the specified load with extension does - /// not work with the with specified type and indicate what to do about it. + /// not work with the specified type and indicate what to do about it. void setLoadExtAction(unsigned ExtType, MVT VT, LegalizeAction Action) { assert((unsigned)VT.SimpleTy*2 < 63 && @@ -1001,7 +981,7 @@ class TargetLowering { } /// setTruncStoreAction - Indicate that the specified truncating store does - /// not work with the with specified type and indicate what to do about it. + /// not work with the specified type and indicate what to do about it. 
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action) { assert((unsigned)ValVT.SimpleTy < array_lengthof(TruncStoreActions) && @@ -1012,7 +992,7 @@ class TargetLowering { } /// setIndexedLoadAction - Indicate that the specified indexed load does or - /// does not work with the with specified type and indicate what to do abort + /// does not work with the specified type and indicate what to do abort /// it. NOTE: All indexed mode loads are initialized to Expand in /// TargetLowering.cpp void setIndexedLoadAction(unsigned IdxMode, MVT VT, @@ -1024,7 +1004,7 @@ class TargetLowering { } /// setIndexedStoreAction - Indicate that the specified indexed store does or - /// does not work with the with specified type and indicate what to do about + /// does not work with the specified type and indicate what to do about /// it. NOTE: All indexed mode stores are initialized to Expand in /// TargetLowering.cpp void setIndexedStoreAction(unsigned IdxMode, MVT VT, @@ -1035,17 +1015,6 @@ class TargetLowering { IndexedModeActions[(unsigned)VT.SimpleTy][1][IdxMode] = (uint8_t)Action; } - /// setConvertAction - Indicate that the specified conversion does or does - /// not work with the with specified type and indicate what to do about it. - void setConvertAction(MVT FromVT, MVT ToVT, - LegalizeAction Action) { - assert((unsigned)FromVT.SimpleTy < array_lengthof(ConvertActions) && - (unsigned)ToVT.SimpleTy < MVT::LAST_VALUETYPE && - "Table isn't big enough!"); - ConvertActions[FromVT.SimpleTy] &= ~(uint64_t(3UL) << ToVT.SimpleTy*2); - ConvertActions[FromVT.SimpleTy] |= (uint64_t)Action << ToVT.SimpleTy*2; - } - /// setCondCodeAction - Indicate that the specified condition code is or isn't /// supported on the target and indicate what to do about it. void setCondCodeAction(ISD::CondCode CC, MVT VT, @@ -1674,13 +1643,6 @@ class TargetLowering { /// represents the various modes for load store. 
uint8_t IndexedModeActions[MVT::LAST_VALUETYPE][2][ISD::LAST_INDEXED_MODE]; - /// ConvertActions - For each conversion from source type to destination type, - /// keep a LegalizeAction that indicates how instruction selection should - /// deal with the conversion. - /// Currently, this is used only for floating->floating conversions - /// (FP_EXTEND and FP_ROUND). - uint64_t ConvertActions[MVT::LAST_VALUETYPE]; - /// CondCodeActions - For each condition code (ISD::CondCode) keep a /// LegalizeAction that indicates how instruction selection should /// deal with the condition code. diff --git a/include/llvm/Target/TargetMachine.h b/include/llvm/Target/TargetMachine.h index b239a306761f..d1d665f870ff 100644 --- a/include/llvm/Target/TargetMachine.h +++ b/include/llvm/Target/TargetMachine.h @@ -192,7 +192,7 @@ class TargetMachine { formatted_raw_ostream &, CodeGenFileType, CodeGenOpt::Level, - bool DisableVerify = true) { + bool = true) { return true; } @@ -205,7 +205,7 @@ class TargetMachine { virtual bool addPassesToEmitMachineCode(PassManagerBase &, JITCodeEmitter &, CodeGenOpt::Level, - bool DisableVerify = true) { + bool = true) { return true; } @@ -216,7 +216,7 @@ class TargetMachine { virtual bool addPassesToEmitWholeFile(PassManager &, formatted_raw_ostream &, CodeGenFileType, CodeGenOpt::Level, - bool DisableVerify = true) { + bool = true) { return true; } }; diff --git a/include/llvm/Target/TargetSelectionDAG.td b/include/llvm/Target/TargetSelectionDAG.td index e56d886a0a1a..58ccfbacc6f7 100644 --- a/include/llvm/Target/TargetSelectionDAG.td +++ b/include/llvm/Target/TargetSelectionDAG.td @@ -92,6 +92,10 @@ def SDTIntBinOp : SDTypeProfile<1, 2, [ // add, and, or, xor, udiv, etc. 
def SDTIntShiftOp : SDTypeProfile<1, 2, [ // shl, sra, srl SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<2> ]>; +def SDTIntBinHiLoOp : SDTypeProfile<2, 2, [ // mulhi, mullo, sdivrem, udivrem + SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>,SDTCisInt<0> +]>; + def SDTFPBinOp : SDTypeProfile<1, 2, [ // fadd, fmul, etc. SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisFP<0> ]>; @@ -235,7 +239,6 @@ class SDNode; def mulhs : SDNode<"ISD::MULHS" , SDTIntBinOp, [SDNPCommutative]>; def mulhu : SDNode<"ISD::MULHU" , SDTIntBinOp, [SDNPCommutative]>; +def smullohi : SDNode<"ISD::SMUL_LOHI" , SDTIntBinHiLoOp, [SDNPCommutative]>; +def umullohi : SDNode<"ISD::UMUL_LOHI" , SDTIntBinHiLoOp, [SDNPCommutative]>; def sdiv : SDNode<"ISD::SDIV" , SDTIntBinOp>; def udiv : SDNode<"ISD::UDIV" , SDTIntBinOp>; def srem : SDNode<"ISD::SREM" , SDTIntBinOp>; def urem : SDNode<"ISD::UREM" , SDTIntBinOp>; +def sdivrem : SDNode<"ISD::SDIVREM" , SDTIntBinHiLoOp>; +def udivrem : SDNode<"ISD::UDIVREM" , SDTIntBinHiLoOp>; def srl : SDNode<"ISD::SRL" , SDTIntShiftOp>; def sra : SDNode<"ISD::SRA" , SDTIntShiftOp>; def shl : SDNode<"ISD::SHL" , SDTIntShiftOp>; @@ -485,22 +492,15 @@ def vtFP : PatLeaf<(vt), [{ return N->getVT().isFloatingPoint(); }]>; def immAllOnesV: PatLeaf<(build_vector), [{ return ISD::isBuildVectorAllOnes(N); }]>; -def immAllOnesV_bc: PatLeaf<(bitconvert), [{ - return ISD::isBuildVectorAllOnes(N); -}]>; def immAllZerosV: PatLeaf<(build_vector), [{ return ISD::isBuildVectorAllZeros(N); }]>; -def immAllZerosV_bc: PatLeaf<(bitconvert), [{ - return ISD::isBuildVectorAllZeros(N); -}]>; // Other helper fragments. def not : PatFrag<(ops node:$in), (xor node:$in, -1)>; def vnot : PatFrag<(ops node:$in), (xor node:$in, immAllOnesV)>; -def vnot_conv : PatFrag<(ops node:$in), (xor node:$in, immAllOnesV_bc)>; def ineg : PatFrag<(ops node:$in), (sub 0, node:$in)>; // load fragments. 
diff --git a/include/llvm/Transforms/Utils/BuildLibCalls.h b/include/llvm/Transforms/Utils/BuildLibCalls.h index ac5f07e2d384..d278672a1f02 100644 --- a/include/llvm/Transforms/Utils/BuildLibCalls.h +++ b/include/llvm/Transforms/Utils/BuildLibCalls.h @@ -49,6 +49,12 @@ namespace llvm { Value *EmitMemCpy(Value *Dst, Value *Src, Value *Len, unsigned Align, IRBuilder<> &B, const TargetData *TD); + /// EmitMemCpyChk - Emit a call to the __memcpy_chk function to the builder. + /// This expects that the Len and ObjSize have type 'intptr_t' and Dst/Src + /// are pointers. + Value *EmitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize, + IRBuilder<> &B, const TargetData *TD); + /// EmitMemMove - Emit a call to the memmove function to the builder. This /// always expects that the size has type 'intptr_t' and Dst/Src are pointers. Value *EmitMemMove(Value *Dst, Value *Src, Value *Len, diff --git a/include/llvm/Transforms/Utils/SSAUpdater.h b/include/llvm/Transforms/Utils/SSAUpdater.h index 927e156abfb5..b29b749e8d9a 100644 --- a/include/llvm/Transforms/Utils/SSAUpdater.h +++ b/include/llvm/Transforms/Utils/SSAUpdater.h @@ -27,22 +27,28 @@ namespace llvm { /// transformation wants to rewrite a set of uses of one value with uses of a /// set of values. class SSAUpdater { +public: + class BBInfo; + +private: /// AvailableVals - This keeps track of which value to use on a per-block - /// basis. When we insert PHI nodes, we keep track of them here. We use - /// TrackingVH's for the value of the map because we RAUW PHI nodes when we - /// eliminate them, and want the TrackingVH's to track this. - //typedef DenseMap > AvailableValsTy; + /// basis. When we insert PHI nodes, we keep track of them here. + //typedef DenseMap AvailableValsTy; void *AV; /// PrototypeValue is an arbitrary representative value, which we derive names /// and a type for PHI nodes. Value *PrototypeValue; - /// IncomingPredInfo - We use this as scratch space when doing our recursive - /// walk. 
This should only be used in GetValueInBlockInternal, normally it - /// should be empty. - //std::vector > > IncomingPredInfo; - void *IPI; + /// BBMap - The GetValueAtEndOfBlock method maintains this mapping from + /// basic blocks to BBInfo structures. + /// typedef DenseMap BBMapTy; + void *BM; + + /// Allocator - The GetValueAtEndOfBlock method uses this BumpPtrAllocator to + /// hold its internal data. The allocator and its storage is created and + /// discarded for each invocation of GetValueAtEndOfBlock. + void *BPA; /// InsertedPHIs - If this is non-null, the SSAUpdater adds all PHI nodes that /// it creates to the vector. @@ -99,6 +105,14 @@ class SSAUpdater { private: Value *GetValueAtEndOfBlockInternal(BasicBlock *BB); + void FindPHIPlacement(BasicBlock *BB, BBInfo *Info, bool &Changed, + unsigned Counter); + void FindAvailableVal(BasicBlock *BB, BBInfo *Info, unsigned Counter); + void FindExistingPHI(BasicBlock *BB); + bool CheckIfPHIMatches(PHINode *PHI); + void RecordMatchingPHI(PHINode *PHI); + void ClearPHITags(PHINode *PHI); + void operator=(const SSAUpdater&); // DO NOT IMPLEMENT SSAUpdater(const SSAUpdater&); // DO NOT IMPLEMENT }; diff --git a/include/llvm/Type.h b/include/llvm/Type.h index d09913a1df33..df3a16c59261 100644 --- a/include/llvm/Type.h +++ b/include/llvm/Type.h @@ -548,9 +548,11 @@ template <> struct GraphTraits { } }; -template <> inline bool isa_impl(const Type &Ty) { - return Ty.getTypeID() == Type::PointerTyID; -} +template <> struct isa_impl { + static inline bool doit(const Type &Ty) { + return Ty.getTypeID() == Type::PointerTyID; + } +}; raw_ostream &operator<<(raw_ostream &OS, const Type &T); diff --git a/include/llvm/Value.h b/include/llvm/Value.h index d06cbc05c6a5..bc25a0f40144 100644 --- a/include/llvm/Value.h +++ b/include/llvm/Value.h @@ -157,13 +157,13 @@ class Value { // Methods for handling the chain of uses of this Value. 
// typedef value_use_iterator use_iterator; - typedef value_use_iterator use_const_iterator; + typedef value_use_iterator const_use_iterator; bool use_empty() const { return UseList == 0; } use_iterator use_begin() { return use_iterator(UseList); } - use_const_iterator use_begin() const { return use_const_iterator(UseList); } + const_use_iterator use_begin() const { return const_use_iterator(UseList); } use_iterator use_end() { return use_iterator(0); } - use_const_iterator use_end() const { return use_const_iterator(0); } + const_use_iterator use_end() const { return const_use_iterator(0); } User *use_back() { return *use_begin(); } const User *use_back() const { return *use_begin(); } @@ -172,7 +172,7 @@ class Value { /// traversing the whole use list. /// bool hasOneUse() const { - use_const_iterator I = use_begin(), E = use_end(); + const_use_iterator I = use_begin(), E = use_end(); if (I == E) return false; return ++I == E; } @@ -324,39 +324,67 @@ void Use::set(Value *V) { // isa - Provide some specializations of isa so that we don't have to include // the subtype header files to test to see if the value is a subclass... 
// -template <> inline bool isa_impl(const Value &Val) { - return Val.getValueID() >= Value::ConstantFirstVal && - Val.getValueID() <= Value::ConstantLastVal; -} -template <> inline bool isa_impl(const Value &Val) { - return Val.getValueID() == Value::ArgumentVal; -} -template <> inline bool isa_impl(const Value &Val) { - return Val.getValueID() == Value::InlineAsmVal; -} -template <> inline bool isa_impl(const Value &Val) { - return Val.getValueID() >= Value::InstructionVal; -} -template <> inline bool isa_impl(const Value &Val) { - return Val.getValueID() == Value::BasicBlockVal; -} -template <> inline bool isa_impl(const Value &Val) { - return Val.getValueID() == Value::FunctionVal; -} -template <> inline bool isa_impl(const Value &Val) { - return Val.getValueID() == Value::GlobalVariableVal; -} -template <> inline bool isa_impl(const Value &Val) { - return Val.getValueID() == Value::GlobalAliasVal; -} -template <> inline bool isa_impl(const Value &Val) { - return isa(Val) || isa(Val) || - isa(Val); -} -template <> inline bool isa_impl(const Value &Val) { - return Val.getValueID() == Value::MDNodeVal; -} - +template <> struct isa_impl { + static inline bool doit(const Value &Val) { + return Val.getValueID() >= Value::ConstantFirstVal && + Val.getValueID() <= Value::ConstantLastVal; + } +}; + +template <> struct isa_impl { + static inline bool doit (const Value &Val) { + return Val.getValueID() == Value::ArgumentVal; + } +}; + +template <> struct isa_impl { + static inline bool doit(const Value &Val) { + return Val.getValueID() == Value::InlineAsmVal; + } +}; + +template <> struct isa_impl { + static inline bool doit(const Value &Val) { + return Val.getValueID() >= Value::InstructionVal; + } +}; + +template <> struct isa_impl { + static inline bool doit(const Value &Val) { + return Val.getValueID() == Value::BasicBlockVal; + } +}; + +template <> struct isa_impl { + static inline bool doit(const Value &Val) { + return Val.getValueID() == Value::FunctionVal; + } 
+}; + +template <> struct isa_impl { + static inline bool doit(const Value &Val) { + return Val.getValueID() == Value::GlobalVariableVal; + } +}; + +template <> struct isa_impl { + static inline bool doit(const Value &Val) { + return Val.getValueID() == Value::GlobalAliasVal; + } +}; + +template <> struct isa_impl { + static inline bool doit(const Value &Val) { + return isa(Val) || isa(Val) || + isa(Val); + } +}; + +template <> struct isa_impl { + static inline bool doit(const Value &Val) { + return Val.getValueID() == Value::MDNodeVal; + } +}; // Value* is only 4-byte aligned. template<> diff --git a/lib/Analysis/CaptureTracking.cpp b/lib/Analysis/CaptureTracking.cpp index 8767c1814172..047825884ef3 100644 --- a/lib/Analysis/CaptureTracking.cpp +++ b/lib/Analysis/CaptureTracking.cpp @@ -49,7 +49,7 @@ bool llvm::PointerMayBeCaptured(const Value *V, SmallSet Visited; int Count = 0; - for (Value::use_const_iterator UI = V->use_begin(), UE = V->use_end(); + for (Value::const_use_iterator UI = V->use_begin(), UE = V->use_end(); UI != UE; ++UI) { // If there are lots of uses, conservatively say that the value // is captured to avoid taking too much compile time. 
diff --git a/lib/Analysis/DebugInfo.cpp b/lib/Analysis/DebugInfo.cpp index fda69ac39a22..f12552d9e400 100644 --- a/lib/Analysis/DebugInfo.cpp +++ b/lib/Analysis/DebugInfo.cpp @@ -96,9 +96,8 @@ DIDescriptor DIDescriptor::getDescriptorField(unsigned Elt) const { if (DbgNode == 0) return DIDescriptor(); - if (Elt < DbgNode->getNumOperands() && DbgNode->getOperand(Elt)) - return DIDescriptor(dyn_cast(DbgNode->getOperand(Elt))); - + if (Elt < DbgNode->getNumOperands()) + return DIDescriptor(dyn_cast_or_null(DbgNode->getOperand(Elt))); return DIDescriptor(); } diff --git a/lib/Analysis/IPA/CallGraphSCCPass.cpp b/lib/Analysis/IPA/CallGraphSCCPass.cpp index 0e333d177450..0f39f44a0099 100644 --- a/lib/Analysis/IPA/CallGraphSCCPass.cpp +++ b/lib/Analysis/IPA/CallGraphSCCPass.cpp @@ -17,12 +17,13 @@ #define DEBUG_TYPE "cgscc-passmgr" #include "llvm/CallGraphSCCPass.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Function.h" +#include "llvm/PassManagers.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/ADT/SCCIterator.h" -#include "llvm/PassManagers.h" -#include "llvm/Function.h" #include "llvm/Support/Debug.h" -#include "llvm/IntrinsicInst.h" +#include "llvm/Support/Timer.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -102,9 +103,10 @@ bool CGPassManager::RunPassOnSCC(Pass *P, std::vector &CurSCC, CallGraphUpToDate = true; } - Timer *T = StartPassTimer(CGSP); - Changed = CGSP->runOnSCC(CurSCC); - StopPassTimer(CGSP, T); + { + TimeRegion PassTimer(getPassTimer(CGSP)); + Changed = CGSP->runOnSCC(CurSCC); + } // After the CGSCCPass is done, when assertions are enabled, use // RefreshCallGraph to verify that the callgraph was correctly updated. 
@@ -125,9 +127,8 @@ bool CGPassManager::RunPassOnSCC(Pass *P, std::vector &CurSCC, for (unsigned i = 0, e = CurSCC.size(); i != e; ++i) { if (Function *F = CurSCC[i]->getFunction()) { dumpPassInfo(P, EXECUTION_MSG, ON_FUNCTION_MSG, F->getName()); - Timer *T = StartPassTimer(FPP); + TimeRegion PassTimer(getPassTimer(FPP)); Changed |= FPP->runOnFunction(*F); - StopPassTimer(FPP, T); } } diff --git a/lib/Analysis/IPA/GlobalsModRef.cpp b/lib/Analysis/IPA/GlobalsModRef.cpp index 7b43089d592c..b14afa323134 100644 --- a/lib/Analysis/IPA/GlobalsModRef.cpp +++ b/lib/Analysis/IPA/GlobalsModRef.cpp @@ -257,7 +257,7 @@ bool GlobalsModRef::AnalyzeUsesOfPointer(Value *V, } else if (InvokeInst *II = dyn_cast(*UI)) { // Make sure that this is just the function being called, not that it is // passing into the function. - for (unsigned i = 3, e = II->getNumOperands(); i != e; ++i) + for (unsigned i = 0, e = II->getNumOperands() - 3; i != e; ++i) if (II->getOperand(i) == V) return true; } else if (ConstantExpr *CE = dyn_cast(*UI)) { if (CE->getOpcode() == Instruction::GetElementPtr || diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp index 5b8b53495c40..c599e907eb60 100644 --- a/lib/Analysis/InlineCost.cpp +++ b/lib/Analysis/InlineCost.cpp @@ -255,9 +255,11 @@ InlineCost InlineCostAnalyzer::getInlineCost(CallSite CS, Function *Caller = TheCall->getParent()->getParent(); // Don't inline functions which can be redefined at link-time to mean - // something else. Don't inline functions marked noinline. + // something else. Don't inline functions marked noinline or call sites + // marked noinline. 
if (Callee->mayBeOverridden() || - Callee->hasFnAttr(Attribute::NoInline) || NeverInline.count(Callee)) + Callee->hasFnAttr(Attribute::NoInline) || NeverInline.count(Callee) || + CS.isNoInline()) return llvm::InlineCost::getNever(); // InlineCost - This value measures how good of an inline candidate this call diff --git a/lib/Analysis/LiveValues.cpp b/lib/Analysis/LiveValues.cpp index 1b91d93c0e1f..23964ffc457e 100644 --- a/lib/Analysis/LiveValues.cpp +++ b/lib/Analysis/LiveValues.cpp @@ -125,7 +125,7 @@ LiveValues::Memo &LiveValues::compute(const Value *V) { bool LiveOutOfDefBB = false; // Examine each use of the value. - for (Value::use_const_iterator I = V->use_begin(), E = V->use_end(); + for (Value::const_use_iterator I = V->use_begin(), E = V->use_end(); I != E; ++I) { const User *U = *I; const BasicBlock *UseBB = cast(U)->getParent(); diff --git a/lib/Analysis/LoopPass.cpp b/lib/Analysis/LoopPass.cpp index 2d613f6de71a..e2d2c2bc9d88 100644 --- a/lib/Analysis/LoopPass.cpp +++ b/lib/Analysis/LoopPass.cpp @@ -14,6 +14,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/LoopPass.h" +#include "llvm/Support/Timer.h" using namespace llvm; //===----------------------------------------------------------------------===// @@ -221,22 +222,22 @@ bool LPPassManager::runOnFunction(Function &F) { LoopPass *P = (LoopPass*)getContainedPass(Index); dumpPassInfo(P, EXECUTION_MSG, ON_LOOP_MSG, - CurrentLoop->getHeader()->getNameStr()); + CurrentLoop->getHeader()->getName()); dumpRequiredSet(P); initializeAnalysisImpl(P); { PassManagerPrettyStackEntry X(P, *CurrentLoop->getHeader()); - Timer *T = StartPassTimer(P); + TimeRegion PassTimer(getPassTimer(P)); + Changed |= P->runOnLoop(CurrentLoop, *this); - StopPassTimer(P, T); } if (Changed) dumpPassInfo(P, MODIFICATION_MSG, ON_LOOP_MSG, skipThisLoop ? 
"" : - CurrentLoop->getHeader()->getNameStr()); + CurrentLoop->getHeader()->getName()); dumpPreservedSet(P); if (!skipThisLoop) { @@ -245,9 +246,10 @@ bool LPPassManager::runOnFunction(Function &F) { // is a function pass and it's really expensive to verify every // loop in the function every time. That level of checking can be // enabled with the -verify-loop-info option. - Timer *T = StartPassTimer(LI); - CurrentLoop->verifyLoop(); - StopPassTimer(LI, T); + { + TimeRegion PassTimer(getPassTimer(LI)); + CurrentLoop->verifyLoop(); + } // Then call the regular verifyAnalysis functions. verifyPreservedAnalysis(P); @@ -257,7 +259,7 @@ bool LPPassManager::runOnFunction(Function &F) { recordAvailableAnalysis(P); removeDeadPasses(P, skipThisLoop ? "" : - CurrentLoop->getHeader()->getNameStr(), + CurrentLoop->getHeader()->getName(), ON_LOOP_MSG); if (skipThisLoop) diff --git a/lib/Analysis/MemoryBuiltins.cpp b/lib/Analysis/MemoryBuiltins.cpp index 297b5880c418..89f9743daa54 100644 --- a/lib/Analysis/MemoryBuiltins.cpp +++ b/lib/Analysis/MemoryBuiltins.cpp @@ -139,7 +139,7 @@ const PointerType *llvm::getMallocType(const CallInst *CI) { unsigned NumOfBitCastUses = 0; // Determine if CallInst has a bitcast use. 
- for (Value::use_const_iterator UI = CI->use_begin(), E = CI->use_end(); + for (Value::const_use_iterator UI = CI->use_begin(), E = CI->use_end(); UI != E; ) if (const BitCastInst *BCI = dyn_cast(*UI++)) { MallocType = cast(BCI->getDestTy()); diff --git a/lib/Analysis/ProfileEstimatorPass.cpp b/lib/Analysis/ProfileEstimatorPass.cpp index bce6b310ce06..da4ce4769262 100644 --- a/lib/Analysis/ProfileEstimatorPass.cpp +++ b/lib/Analysis/ProfileEstimatorPass.cpp @@ -398,7 +398,7 @@ bool ProfileEstimatorPass::runOnFunction(Function &F) { for (Function::const_iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) { const BasicBlock *BB = &(*FI); BlockInformation[&F][BB] = 0; - pred_const_iterator predi = pred_begin(BB), prede = pred_end(BB); + const_pred_iterator predi = pred_begin(BB), prede = pred_end(BB); if (predi == prede) { Edge e = getEdge(0,BB); setEdgeWeight(e,0); diff --git a/lib/Analysis/ProfileInfo.cpp b/lib/Analysis/ProfileInfo.cpp index 85531be76add..662576e1d0a4 100644 --- a/lib/Analysis/ProfileInfo.cpp +++ b/lib/Analysis/ProfileInfo.cpp @@ -67,7 +67,7 @@ ProfileInfoT::getExecutionCount(const BasicBlock *BB) { double Count = MissingValue; - pred_const_iterator PI = pred_begin(BB), PE = pred_end(BB); + const_pred_iterator PI = pred_begin(BB), PE = pred_end(BB); // Are there zero predecessors of this block? 
if (PI == PE) { @@ -508,7 +508,7 @@ bool ProfileInfoT:: // have no value double incount = 0; SmallSet pred_visited; - pred_const_iterator bbi = pred_begin(BB), bbe = pred_end(BB); + const_pred_iterator bbi = pred_begin(BB), bbe = pred_end(BB); if (bbi==bbe) { Edge e = getEdge(0,BB); incount += readEdgeOrRemember(e, getEdgeWeight(e) ,edgetocalc,uncalculated); @@ -582,7 +582,7 @@ bool ProfileInfoT::EstimateMissingEdges(const BasicBlock *B double inWeight = 0; std::set inMissing; std::set ProcessedPreds; - pred_const_iterator bbi = pred_begin(BB), bbe = pred_end(BB); + const_pred_iterator bbi = pred_begin(BB), bbe = pred_end(BB); if (bbi == bbe) { readEdge(this,getEdge(0,BB),inWeight,inMissing); } @@ -639,7 +639,7 @@ void ProfileInfoT::repair(const Function *F) { // FI != FE; ++FI) { // const BasicBlock* BB = &(*FI); // { -// pred_const_iterator NBB = pred_begin(BB), End = pred_end(BB); +// const_pred_iterator NBB = pred_begin(BB), End = pred_end(BB); // if (NBB == End) { // setEdgeWeight(getEdge(0,BB),0); // } @@ -779,7 +779,7 @@ void ProfileInfoT::repair(const Function *F) { // Calculate incoming flow. 
double iw = 0; unsigned inmissing = 0; unsigned incount = 0; unsigned invalid = 0; std::set Processed; - for (pred_const_iterator NBB = pred_begin(BB), End = pred_end(BB); + for (const_pred_iterator NBB = pred_begin(BB), End = pred_end(BB); NBB != End; ++NBB) { if (Processed.insert(*NBB).second) { Edge e = getEdge(*NBB, BB); @@ -869,7 +869,7 @@ void ProfileInfoT::repair(const Function *F) { if (getEdgeWeight(e) == MissingValue) { double iw = 0; std::set Processed; - for (pred_const_iterator NBB = pred_begin(BB), End = pred_end(BB); + for (const_pred_iterator NBB = pred_begin(BB), End = pred_end(BB); NBB != End; ++NBB) { if (Processed.insert(*NBB).second) { Edge e = getEdge(*NBB, BB); @@ -893,7 +893,7 @@ void ProfileInfoT::repair(const Function *F) { const BasicBlock *Dest; Path P; bool BackEdgeFound = false; - for (pred_const_iterator NBB = pred_begin(BB), End = pred_end(BB); + for (const_pred_iterator NBB = pred_begin(BB), End = pred_end(BB); NBB != End; ++NBB) { Dest = GetPath(BB, *NBB, P, GetPathToDest | GetPathWithNewEdges); if (Dest == *NBB) { @@ -935,7 +935,7 @@ void ProfileInfoT::repair(const Function *F) { // Calculate incoming flow. 
double iw = 0; std::set Processed; - for (pred_const_iterator NBB = pred_begin(BB), End = pred_end(BB); + for (const_pred_iterator NBB = pred_begin(BB), End = pred_end(BB); NBB != End; ++NBB) { if (Processed.insert(*NBB).second) { Edge e = getEdge(*NBB, BB); @@ -965,7 +965,7 @@ void ProfileInfoT::repair(const Function *F) { while(FI != FE && !FoundPath) { const BasicBlock *BB = *FI; ++FI; - for (pred_const_iterator NBB = pred_begin(BB), End = pred_end(BB); + for (const_pred_iterator NBB = pred_begin(BB), End = pred_end(BB); NBB != End; ++NBB) { Edge e = getEdge(*NBB,BB); double w = getEdgeWeight(e); diff --git a/lib/Analysis/ProfileInfoLoaderPass.cpp b/lib/Analysis/ProfileInfoLoaderPass.cpp index ac9ed524deef..8ea4ecf54f98 100644 --- a/lib/Analysis/ProfileInfoLoaderPass.cpp +++ b/lib/Analysis/ProfileInfoLoaderPass.cpp @@ -119,7 +119,7 @@ void LoaderPass::recurseBasicBlock(const BasicBlock *BB) { bbi != bbe; ++bbi) { recurseBasicBlock(*bbi); } - for (pred_const_iterator bbi = pred_begin(BB), bbe = pred_end(BB); + for (const_pred_iterator bbi = pred_begin(BB), bbe = pred_end(BB); bbi != bbe; ++bbi) { recurseBasicBlock(*bbi); } diff --git a/lib/Analysis/ProfileVerifierPass.cpp b/lib/Analysis/ProfileVerifierPass.cpp index a2ddc8e1f33f..5d87e14a97b4 100644 --- a/lib/Analysis/ProfileVerifierPass.cpp +++ b/lib/Analysis/ProfileVerifierPass.cpp @@ -96,8 +96,8 @@ namespace llvm { double inWeight = 0; int inCount = 0; std::set ProcessedPreds; - for ( pred_const_iterator bbi = pred_begin(BB), bbe = pred_end(BB); - bbi != bbe; ++bbi ) { + for (const_pred_iterator bbi = pred_begin(BB), bbe = pred_end(BB); + bbi != bbe; ++bbi ) { if (ProcessedPreds.insert(*bbi).second) { typename ProfileInfoT::Edge E = PI->getEdge(*bbi,BB); double EdgeWeight = PI->getEdgeWeight(E); @@ -242,7 +242,7 @@ namespace llvm { // Read predecessors. 
std::set ProcessedPreds; - pred_const_iterator bpi = pred_begin(BB), bpe = pred_end(BB); + const_pred_iterator bpi = pred_begin(BB), bpe = pred_end(BB); // If there are none, check for (0,BB) edge. if (bpi == bpe) { DI.inWeight += ReadOrAssert(PI->getEdge(0,BB)); diff --git a/lib/Analysis/ScalarEvolutionExpander.cpp b/lib/Analysis/ScalarEvolutionExpander.cpp index 138cdc64cd0c..2e18ceac525e 100644 --- a/lib/Analysis/ScalarEvolutionExpander.cpp +++ b/lib/Analysis/ScalarEvolutionExpander.cpp @@ -1268,19 +1268,8 @@ Value *SCEVExpander::expand(const SCEV *S) { L = L->getParentLoop()) if (S->isLoopInvariant(L)) { if (!L) break; - if (BasicBlock *Preheader = L->getLoopPreheader()) { + if (BasicBlock *Preheader = L->getLoopPreheader()) InsertPt = Preheader->getTerminator(); - BasicBlock::iterator IP = InsertPt; - // Back past any debug info instructions. Sometimes we inserted - // something earlier before debug info but after any real instructions. - // This should behave the same as if debug info was not present. - while (IP != Preheader->begin()) { - --IP; - if (!isa(IP)) - break; - InsertPt = IP; - } - } } else { // If the SCEV is computable at this level, insert it into the header // after the PHIs (and after any other instructions that we've inserted diff --git a/lib/Archive/ArchiveWriter.cpp b/lib/Archive/ArchiveWriter.cpp index 58fbbf44141c..a02601a4aadc 100644 --- a/lib/Archive/ArchiveWriter.cpp +++ b/lib/Archive/ArchiveWriter.cpp @@ -220,7 +220,7 @@ Archive::writeMember( } // Now that we have the data in memory, update the - // symbol table if its a bitcode file. + // symbol table if it's a bitcode file. 
if (CreateSymbolTable && member.isBitcode()) { std::vector symbols; std::string FullMemberName = archPath.str() + "(" + member.getPath().str() diff --git a/lib/AsmParser/LLLexer.h b/lib/AsmParser/LLLexer.h index 3057992231b5..70f1cfdbfd8c 100644 --- a/lib/AsmParser/LLLexer.h +++ b/lib/AsmParser/LLLexer.h @@ -55,7 +55,7 @@ namespace llvm { typedef SMLoc LocTy; LocTy getLoc() const { return SMLoc::getFromPointer(TokStart); } lltok::Kind getKind() const { return CurKind; } - const std::string getStrVal() const { return StrVal; } + const std::string &getStrVal() const { return StrVal; } const Type *getTyVal() const { return TyVal; } unsigned getUIntVal() const { return UIntVal; } const APSInt &getAPSIntVal() const { return APSIntVal; } diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp index 8083a07fdbcd..cdad0770a38a 100644 --- a/lib/AsmParser/LLParser.cpp +++ b/lib/AsmParser/LLParser.cpp @@ -39,6 +39,27 @@ bool LLParser::Run() { /// ValidateEndOfModule - Do final validity and sanity checks at the end of the /// module. bool LLParser::ValidateEndOfModule() { + // Handle any instruction metadata forward references. + if (!ForwardRefInstMetadata.empty()) { + for (DenseMap >::iterator + I = ForwardRefInstMetadata.begin(), E = ForwardRefInstMetadata.end(); + I != E; ++I) { + Instruction *Inst = I->first; + const std::vector &MDList = I->second; + + for (unsigned i = 0, e = MDList.size(); i != e; ++i) { + unsigned SlotNo = MDList[i].MDSlot; + + if (SlotNo >= NumberedMetadata.size() || NumberedMetadata[SlotNo] == 0) + return Error(MDList[i].Loc, "use of undefined metadata '!" + + utostr(SlotNo) + "'"); + Inst->setMetadata(MDList[i].MDKind, NumberedMetadata[SlotNo]); + } + } + ForwardRefInstMetadata.clear(); + } + + // Update auto-upgraded malloc calls to "malloc". // FIXME: Remove in LLVM 3.0. if (MallocF) { @@ -472,18 +493,30 @@ bool LLParser::ParseMDString(MDString *&Result) { // MDNode: // ::= '!' 
MDNodeNumber +// +/// This version of ParseMDNodeID returns the slot number and null in the case +/// of a forward reference. +bool LLParser::ParseMDNodeID(MDNode *&Result, unsigned &SlotNo) { + // !{ ..., !42, ... } + if (ParseUInt32(SlotNo)) return true; + + // Check existing MDNode. + if (SlotNo < NumberedMetadata.size() && NumberedMetadata[SlotNo] != 0) + Result = NumberedMetadata[SlotNo]; + else + Result = 0; + return false; +} + bool LLParser::ParseMDNodeID(MDNode *&Result) { // !{ ..., !42, ... } unsigned MID = 0; - if (ParseUInt32(MID)) return true; + if (ParseMDNodeID(Result, MID)) return true; - // Check existing MDNode. - if (MID < NumberedMetadata.size() && NumberedMetadata[MID] != 0) { - Result = NumberedMetadata[MID]; - return false; - } + // If not a forward reference, just return it now. + if (Result) return false; - // Create MDNode forward reference. + // Otherwise, create MDNode forward reference. // FIXME: This is not unique enough! std::string FwdRefName = "llvm.mdnode.fwdref." + utostr(MID); @@ -1078,9 +1111,7 @@ bool LLParser::ParseOptionalCallingConv(CallingConv::ID &CC) { /// ParseInstructionMetadata /// ::= !dbg !42 (',' !dbg !57)* -bool LLParser:: -ParseInstructionMetadata(SmallVectorImpl > &Result){ +bool LLParser::ParseInstructionMetadata(Instruction *Inst) { do { if (Lex.getKind() != lltok::MetadataVar) return TokError("expected metadata after comma"); @@ -1089,12 +1120,21 @@ ParseInstructionMetadata(SmallVectorImplgetMDKindID(Name.c_str()); - Result.push_back(std::make_pair(MDK, Node)); + if (Node) { + // If we got the node, add it to the instruction. + Inst->setMetadata(MDK, Node); + } else { + MDRef R = { Loc, MDK, NodeID }; + // Otherwise, remember that this should be resolved later. + ForwardRefInstMetadata[Inst].push_back(R); + } // If this is the end of the list, we're done. 
} while (EatIfPresent(lltok::comma)); @@ -2896,22 +2936,17 @@ bool LLParser::ParseBasicBlock(PerFunctionState &PFS) { // With a normal result, we check to see if the instruction is followed by // a comma and metadata. if (EatIfPresent(lltok::comma)) - if (ParseInstructionMetadata(MetadataOnInst)) + if (ParseInstructionMetadata(Inst)) return true; break; case InstExtraComma: // If the instruction parser ate an extra comma at the end of it, it // *must* be followed by metadata. - if (ParseInstructionMetadata(MetadataOnInst)) + if (ParseInstructionMetadata(Inst)) return true; break; } - // Set metadata attached with this instruction. - for (unsigned i = 0, e = MetadataOnInst.size(); i != e; ++i) - Inst->setMetadata(MetadataOnInst[i].first, MetadataOnInst[i].second); - MetadataOnInst.clear(); - BB->getInstList().push_back(Inst); // Set the name on the instruction. diff --git a/lib/AsmParser/LLParser.h b/lib/AsmParser/LLParser.h index 9abe40427367..ae460bbb57b3 100644 --- a/lib/AsmParser/LLParser.h +++ b/lib/AsmParser/LLParser.h @@ -17,6 +17,7 @@ #include "LLLexer.h" #include "llvm/Module.h" #include "llvm/Type.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/Support/ValueHandle.h" #include @@ -76,6 +77,14 @@ namespace llvm { LLVMContext& Context; LLLexer Lex; Module *M; + + // Instruction metadata resolution. Each instruction can have a list of + // MDRef info associated with them. + struct MDRef { + SMLoc Loc; + unsigned MDKind, MDSlot; + }; + DenseMap > ForwardRefInstMetadata; // Type resolution handling data structures. 
std::map > ForwardRefTypes; @@ -171,8 +180,7 @@ namespace llvm { bool ParseOptionalCallingConv(CallingConv::ID &CC); bool ParseOptionalAlignment(unsigned &Alignment); bool ParseOptionalStackAlignment(unsigned &Alignment); - bool ParseInstructionMetadata(SmallVectorImpl > &); + bool ParseInstructionMetadata(Instruction *Inst); bool ParseOptionalCommaAlign(unsigned &Alignment, bool &AteExtraComma); bool ParseIndexList(SmallVectorImpl &Indices,bool &AteExtraComma); bool ParseIndexList(SmallVectorImpl &Indices) { @@ -204,6 +212,7 @@ namespace llvm { bool ParseNamedMetadata(); bool ParseMDString(MDString *&Result); bool ParseMDNodeID(MDNode *&Result); + bool ParseMDNodeID(MDNode *&Result, unsigned &SlotNo); // Type Parsing. bool ParseType(PATypeHolder &Result, bool AllowVoid = false); diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp index b9453c90b583..76d112e045a3 100644 --- a/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/lib/Bitcode/Reader/BitcodeReader.cpp @@ -1527,12 +1527,16 @@ bool BitcodeReader::ParseModule() { bool BitcodeReader::ParseBitcodeInto(Module *M) { TheModule = 0; - if (Buffer->getBufferSize() & 3) - return Error("Bitcode stream should be a multiple of 4 bytes in length"); - unsigned char *BufPtr = (unsigned char *)Buffer->getBufferStart(); unsigned char *BufEnd = BufPtr+Buffer->getBufferSize(); + if (Buffer->getBufferSize() & 3) { + if (!isRawBitcode(BufPtr, BufEnd) && !isBitcodeWrapper(BufPtr, BufEnd)) + return Error("Invalid bitcode signature"); + else + return Error("Bitcode stream should be a multiple of 4 bytes in length"); + } + // If we have a wrapper header, parse it and ignore the non-bc file contents. // The magic number is 0x0B17C0DE stored in little endian. 
if (isBitcodeWrapper(BufPtr, BufEnd)) diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp index 3ab272666749..1f69e1685166 100644 --- a/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -1090,11 +1090,11 @@ static void WriteInstruction(const Instruction &I, unsigned InstID, // Emit value #'s for the fixed parameters. for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) - Vals.push_back(VE.getValueID(I.getOperand(i+3))); // fixed param. + Vals.push_back(VE.getValueID(I.getOperand(i))); // fixed param. // Emit type/value pairs for varargs params. if (FTy->isVarArg()) { - for (unsigned i = 3+FTy->getNumParams(), e = I.getNumOperands(); + for (unsigned i = FTy->getNumParams(), e = I.getNumOperands()-3; i != e; ++i) PushValueAndType(I.getOperand(i), InstID, Vals, VE); // vararg } diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 1d4f7f7ae68d..3e71d18b5261 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -62,7 +62,7 @@ AsmPrinter::AsmPrinter(formatted_raw_ostream &o, TargetMachine &tm, TM(tm), MAI(tm.getMCAsmInfo()), TRI(tm.getRegisterInfo()), OutContext(Streamer.getContext()), OutStreamer(Streamer), - LastMI(0), LastFn(0), Counter(~0U), SetCounter(0), PrevDLT(NULL) { + LastMI(0), LastFn(0), Counter(~0U), SetCounter(0) { DW = 0; MMI = 0; VerboseAsm = Streamer.isVerboseAsm(); } @@ -922,8 +922,8 @@ void AsmPrinter::EmitLabelDifference(const MCSymbol *Hi, const MCSymbol *Lo, // Otherwise, emit with .set (aka assignment). 
MCSymbol *SetLabel = - OutContext.GetOrCreateTemporarySymbol(Twine(MAI->getPrivateGlobalPrefix()) + - "set" + Twine(SetCounter++)); + OutContext.GetOrCreateSymbol(Twine(MAI->getPrivateGlobalPrefix()) + + "set" + Twine(SetCounter++)); OutStreamer.EmitAssignment(SetLabel, Diff); OutStreamer.EmitSymbolValue(SetLabel, Size, 0/*AddrSpace*/); } @@ -1337,25 +1337,12 @@ void AsmPrinter::processDebugLoc(const MachineInstr *MI, if (!MAI || !DW || !MAI->doesSupportDebugInformation() || !DW->ShouldEmitDwarfDebug()) return; - if (MI->getOpcode() == TargetOpcode::DBG_VALUE) - return; - DebugLoc DL = MI->getDebugLoc(); - if (DL.isUnknown()) - return; - DILocation CurDLT = MF->getDILocation(DL); - if (!CurDLT.getScope().Verify()) - return; - if (!BeforePrintingInsn) { + if (!BeforePrintingInsn) // After printing instruction DW->EndScope(MI); - } else if (CurDLT.getNode() != PrevDLT) { - MCSymbol *L = DW->RecordSourceLine(CurDLT.getLineNumber(), - CurDLT.getColumnNumber(), - CurDLT.getScope().getNode()); - DW->BeginScope(MI, L); - PrevDLT = CurDLT.getNode(); - } + else + DW->BeginScope(MI); } @@ -1612,7 +1599,7 @@ MCSymbol *AsmPrinter::GetBlockAddressSymbol(const BasicBlock *BB) const { /// GetCPISymbol - Return the symbol for the specified constant pool entry. MCSymbol *AsmPrinter::GetCPISymbol(unsigned CPID) const { - return OutContext.GetOrCreateTemporarySymbol + return OutContext.GetOrCreateSymbol (Twine(MAI->getPrivateGlobalPrefix()) + "CPI" + Twine(getFunctionNumber()) + "_" + Twine(CPID)); } @@ -1625,7 +1612,7 @@ MCSymbol *AsmPrinter::GetJTISymbol(unsigned JTID, bool isLinkerPrivate) const { /// GetJTSetSymbol - Return the symbol for the specified jump table .set /// FIXME: privatize to AsmPrinter. 
MCSymbol *AsmPrinter::GetJTSetSymbol(unsigned UID, unsigned MBBID) const { - return OutContext.GetOrCreateTemporarySymbol + return OutContext.GetOrCreateSymbol (Twine(MAI->getPrivateGlobalPrefix()) + Twine(getFunctionNumber()) + "_" + Twine(UID) + "_set_" + Twine(MBBID)); } @@ -1639,9 +1626,7 @@ MCSymbol *AsmPrinter::GetSymbolWithGlobalValueBase(const GlobalValue *GV, SmallString<60> NameStr; Mang->getNameWithPrefix(NameStr, GV, ForcePrivate); NameStr.append(Suffix.begin(), Suffix.end()); - if (!GV->hasPrivateLinkage() && !ForcePrivate) - return OutContext.GetOrCreateSymbol(NameStr.str()); - return OutContext.GetOrCreateTemporarySymbol(NameStr.str()); + return OutContext.GetOrCreateSymbol(NameStr.str()); } /// GetExternalSymbolSymbol - Return the MCSymbol for the specified diff --git a/lib/CodeGen/AsmPrinter/DIE.cpp b/lib/CodeGen/AsmPrinter/DIE.cpp index e97754e5d26f..e0e3ff794672 100644 --- a/lib/CodeGen/AsmPrinter/DIE.cpp +++ b/lib/CodeGen/AsmPrinter/DIE.cpp @@ -19,6 +19,7 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Target/TargetData.h" +#include "llvm/Support/Allocator.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" @@ -114,10 +115,11 @@ DIE::~DIE() { /// addSiblingOffset - Add a sibling offset field to the front of the DIE. /// -void DIE::addSiblingOffset() { - DIEInteger *DI = new DIEInteger(0); +DIEValue *DIE::addSiblingOffset(BumpPtrAllocator &A) { + DIEInteger *DI = new (A) DIEInteger(0); Values.insert(Values.begin(), DI); Abbrev.AddFirstAttribute(dwarf::DW_AT_sibling, dwarf::DW_FORM_ref4); + return DI; } #ifndef NDEBUG @@ -276,31 +278,6 @@ void DIELabel::print(raw_ostream &O) { } #endif -//===----------------------------------------------------------------------===// -// DIESectionOffset Implementation -//===----------------------------------------------------------------------===// - -/// EmitValue - Emit delta value. 
-/// -void DIESectionOffset::EmitValue(DwarfPrinter *D, unsigned Form) const { - bool IsSmall = Form == dwarf::DW_FORM_data4; - D->EmitSectionOffset(Label, Section, IsSmall, IsEH); -} - -/// SizeOf - Determine size of delta value in bytes. -/// -unsigned DIESectionOffset::SizeOf(const TargetData *TD, unsigned Form) const { - if (Form == dwarf::DW_FORM_data4) return 4; - return TD->getPointerSize(); -} - -#ifndef NDEBUG -void DIESectionOffset::print(raw_ostream &O) { - O << "Off: " << Label->getName() << "-" << Section->getName() - << "-" << IsEH; -} -#endif - //===----------------------------------------------------------------------===// // DIEDelta Implementation //===----------------------------------------------------------------------===// diff --git a/lib/CodeGen/AsmPrinter/DIE.h b/lib/CodeGen/AsmPrinter/DIE.h index c5909faab7e1..8b27ed20cb63 100644 --- a/lib/CodeGen/AsmPrinter/DIE.h +++ b/lib/CodeGen/AsmPrinter/DIE.h @@ -153,7 +153,7 @@ namespace llvm { unsigned getOffset() const { return Offset; } unsigned getSize() const { return Size; } const std::vector &getChildren() const { return Children; } - SmallVector &getValues() { return Values; } + const SmallVector &getValues() const { return Values; } DIE *getParent() const { return Parent; } void setTag(unsigned Tag) { Abbrev.setTag(Tag); } void setOffset(unsigned O) { Offset = O; } @@ -171,8 +171,10 @@ namespace llvm { unsigned getSiblingOffset() const { return Offset + Size; } /// addSiblingOffset - Add a sibling offset field to the front of the DIE. + /// The caller is responsible for deleting the return value at or after the + /// same time it destroys this DIE. /// - void addSiblingOffset(); + DIEValue *addSiblingOffset(BumpPtrAllocator &A); /// addChild - Add a child to the DIE. 
/// @@ -322,38 +324,6 @@ namespace llvm { static bool classof(const DIELabel *) { return true; } static bool classof(const DIEValue *L) { return L->getType() == isLabel; } -#ifndef NDEBUG - virtual void print(raw_ostream &O); -#endif - }; - - //===--------------------------------------------------------------------===// - /// DIESectionOffset - A section offset DIE. - /// - class DIESectionOffset : public DIEValue { - const MCSymbol *Label; - const MCSymbol *Section; - bool IsEH : 1; - public: - DIESectionOffset(const MCSymbol *Lab, const MCSymbol *Sec, - bool isEH = false) - : DIEValue(isSectionOffset), Label(Lab), Section(Sec), - IsEH(isEH) {} - - /// EmitValue - Emit section offset. - /// - virtual void EmitValue(DwarfPrinter *D, unsigned Form) const; - - /// SizeOf - Determine size of section offset value in bytes. - /// - virtual unsigned SizeOf(const TargetData *TD, unsigned Form) const; - - // Implement isa/cast/dyncast. - static bool classof(const DIESectionOffset *) { return true; } - static bool classof(const DIEValue *D) { - return D->getType() == isSectionOffset; - } - #ifndef NDEBUG virtual void print(raw_ostream &O); #endif diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 7153fe20dcf6..fb91d4f9849f 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -92,11 +92,11 @@ class CompileUnit { /// addGlobal - Add a new global entity to the compile unit. /// - void addGlobal(const std::string &Name, DIE *Die) { Globals[Name] = Die; } + void addGlobal(StringRef Name, DIE *Die) { Globals[Name] = Die; } /// addGlobalType - Add a new global type to the compile unit. /// - void addGlobalType(const std::string &Name, DIE *Die) { + void addGlobalType(StringRef Name, DIE *Die) { GlobalTypes[Name] = Die; } @@ -149,20 +149,26 @@ class DbgVariable { DIVariable Var; // Variable Descriptor. unsigned FrameIndex; // Variable frame index. 
const MachineInstr *DbgValueMInsn; // DBG_VALUE + // DbgValueLabel - DBG_VALUE is effective from this label. + MCSymbol *DbgValueLabel; DbgVariable *const AbstractVar; // Abstract variable for this variable. DIE *TheDIE; public: // AbsVar may be NULL. DbgVariable(DIVariable V, unsigned I, DbgVariable *AbsVar) - : Var(V), FrameIndex(I), DbgValueMInsn(0), AbstractVar(AbsVar), TheDIE(0) {} + : Var(V), FrameIndex(I), DbgValueMInsn(0), + DbgValueLabel(0), AbstractVar(AbsVar), TheDIE(0) {} DbgVariable(DIVariable V, const MachineInstr *MI, DbgVariable *AbsVar) - : Var(V), FrameIndex(0), DbgValueMInsn(MI), AbstractVar(AbsVar), TheDIE(0) + : Var(V), FrameIndex(0), DbgValueMInsn(MI), DbgValueLabel(0), + AbstractVar(AbsVar), TheDIE(0) {} // Accessors. DIVariable getVariable() const { return Var; } unsigned getFrameIndex() const { return FrameIndex; } const MachineInstr *getDbgValue() const { return DbgValueMInsn; } + MCSymbol *getDbgValueLabel() const { return DbgValueLabel; } + void setDbgValueLabel(MCSymbol *L) { DbgValueLabel = L; } DbgVariable *getAbstractVariable() const { return AbstractVar; } void setDIE(DIE *D) { TheDIE = D; } DIE *getDIE() const { return TheDIE; } @@ -224,14 +230,14 @@ class DbgScope { void fixInstructionMarkers(DenseMap &MIIndexMap) { - assert (getFirstInsn() && "First instruction is missing!"); + assert(getFirstInsn() && "First instruction is missing!"); // Use the end of last child scope as end of this scope. 
const SmallVector &Scopes = getScopes(); const MachineInstr *LastInsn = getFirstInsn(); unsigned LIndex = 0; if (Scopes.empty()) { - assert (getLastInsn() && "Inner most scope does not have last insn!"); + assert(getLastInsn() && "Inner most scope does not have last insn!"); return; } for (SmallVector::const_iterator SI = Scopes.begin(), @@ -295,15 +301,15 @@ DbgScope::~DbgScope() { DwarfDebug::DwarfDebug(raw_ostream &OS, AsmPrinter *A, const MCAsmInfo *T) : DwarfPrinter(OS, A, T), ModuleCU(0), AbbreviationsSet(InitAbbreviationsSetSize), Abbreviations(), - DIEValues(), SectionSourceLines(), didInitial(false), shouldEmit(false), - CurrentFnDbgScope(0), DebugTimer(0) { + DIEBlocks(), SectionSourceLines(), didInitial(false), shouldEmit(false), + CurrentFnDbgScope(0), PrevDILoc(0), DebugTimer(0) { NextStringPoolNumber = 0; if (TimePassesIsEnabled) DebugTimer = new Timer("Dwarf Debug Writer"); } DwarfDebug::~DwarfDebug() { - for (unsigned j = 0, M = DIEValues.size(); j < M; ++j) - delete DIEValues[j]; + for (unsigned j = 0, M = DIEBlocks.size(); j < M; ++j) + DIEBlocks[j]->~DIEBlock(); delete DebugTimer; } @@ -343,8 +349,7 @@ void DwarfDebug::assignAbbrevNumber(DIEAbbrev &Abbrev) { /// createDIEEntry - Creates a new DIEEntry to be a proxy for a debug /// information entry. 
DIEEntry *DwarfDebug::createDIEEntry(DIE *Entry) { - DIEEntry *Value = new DIEEntry(Entry); - DIEValues.push_back(Value); + DIEEntry *Value = new (DIEValueAllocator) DIEEntry(Entry); return Value; } @@ -353,8 +358,7 @@ DIEEntry *DwarfDebug::createDIEEntry(DIE *Entry) { void DwarfDebug::addUInt(DIE *Die, unsigned Attribute, unsigned Form, uint64_t Integer) { if (!Form) Form = DIEInteger::BestForm(false, Integer); - DIEValue *Value = new DIEInteger(Integer); - DIEValues.push_back(Value); + DIEValue *Value = new (DIEValueAllocator) DIEInteger(Integer); Die->addValue(Attribute, Form, Value); } @@ -363,8 +367,7 @@ void DwarfDebug::addUInt(DIE *Die, unsigned Attribute, void DwarfDebug::addSInt(DIE *Die, unsigned Attribute, unsigned Form, int64_t Integer) { if (!Form) Form = DIEInteger::BestForm(true, Integer); - DIEValue *Value = new DIEInteger(Integer); - DIEValues.push_back(Value); + DIEValue *Value = new (DIEValueAllocator) DIEInteger(Integer); Die->addValue(Attribute, Form, Value); } @@ -372,8 +375,7 @@ void DwarfDebug::addSInt(DIE *Die, unsigned Attribute, /// keeps string reference. void DwarfDebug::addString(DIE *Die, unsigned Attribute, unsigned Form, StringRef String) { - DIEValue *Value = new DIEString(String); - DIEValues.push_back(Value); + DIEValue *Value = new (DIEValueAllocator) DIEString(String); Die->addValue(Attribute, Form, Value); } @@ -381,18 +383,7 @@ void DwarfDebug::addString(DIE *Die, unsigned Attribute, unsigned Form, /// void DwarfDebug::addLabel(DIE *Die, unsigned Attribute, unsigned Form, const MCSymbol *Label) { - DIEValue *Value = new DIELabel(Label); - DIEValues.push_back(Value); - Die->addValue(Attribute, Form, Value); -} - -/// addSectionOffset - Add a section offset label attribute data and value. 
-/// -void DwarfDebug::addSectionOffset(DIE *Die, unsigned Attribute, unsigned Form, - const MCSymbol *Label,const MCSymbol *Section, - bool isEH) { - DIEValue *Value = new DIESectionOffset(Label, Section, isEH); - DIEValues.push_back(Value); + DIEValue *Value = new (DIEValueAllocator) DIELabel(Label); Die->addValue(Attribute, Form, Value); } @@ -400,8 +391,7 @@ void DwarfDebug::addSectionOffset(DIE *Die, unsigned Attribute, unsigned Form, /// void DwarfDebug::addDelta(DIE *Die, unsigned Attribute, unsigned Form, const MCSymbol *Hi, const MCSymbol *Lo) { - DIEValue *Value = new DIEDelta(Hi, Lo); - DIEValues.push_back(Value); + DIEValue *Value = new (DIEValueAllocator) DIEDelta(Hi, Lo); Die->addValue(Attribute, Form, Value); } @@ -410,7 +400,7 @@ void DwarfDebug::addDelta(DIE *Die, unsigned Attribute, unsigned Form, void DwarfDebug::addBlock(DIE *Die, unsigned Attribute, unsigned Form, DIEBlock *Block) { Block->ComputeSize(TD); - DIEValues.push_back(Block); + DIEBlocks.push_back(Block); // Memoize so we can call the destructor later on. Die->addValue(Attribute, Block->BestForm(), Block); } @@ -457,8 +447,8 @@ void DwarfDebug::addSourceLine(DIE *Die, const DISubprogram *SP) { unsigned Line = SP->getLineNumber(); if (!SP->getContext().Verify()) return; - unsigned FileID = GetOrCreateSourceID(SP->getContext().getDirectory(), - SP->getContext().getFilename()); + unsigned FileID = GetOrCreateSourceID(SP->getDirectory(), + SP->getFilename()); assert(FileID && "Invalid file id"); addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID); addUInt(Die, dwarf::DW_AT_decl_line, 0, Line); @@ -564,7 +554,7 @@ void DwarfDebug::addComplexAddress(DbgVariable *&DV, DIE *Die, // Decode the original location, and use that as the start of the byref // variable's location. 
unsigned Reg = RI->getDwarfRegNum(Location.getReg(), false); - DIEBlock *Block = new DIEBlock(); + DIEBlock *Block = new (DIEValueAllocator) DIEBlock(); if (Location.isReg()) { if (Reg < 32) { @@ -696,15 +686,15 @@ void DwarfDebug::addBlockByrefAddress(DbgVariable *&DV, DIE *Die, } // Get the offsets for the forwarding field and the variable field. - unsigned int forwardingFieldOffset = + unsigned forwardingFieldOffset = DIDerivedType(forwardingField.getNode()).getOffsetInBits() >> 3; - unsigned int varFieldOffset = + unsigned varFieldOffset = DIDerivedType(varField.getNode()).getOffsetInBits() >> 3; // Decode the original location, and use that as the start of the byref // variable's location. unsigned Reg = RI->getDwarfRegNum(Location.getReg(), false); - DIEBlock *Block = new DIEBlock(); + DIEBlock *Block = new (DIEValueAllocator) DIEBlock(); if (Location.isReg()) { if (Reg < 32) @@ -759,7 +749,7 @@ void DwarfDebug::addBlockByrefAddress(DbgVariable *&DV, DIE *Die, void DwarfDebug::addAddress(DIE *Die, unsigned Attribute, const MachineLocation &Location) { unsigned Reg = RI->getDwarfRegNum(Location.getReg(), false); - DIEBlock *Block = new DIEBlock(); + DIEBlock *Block = new (DIEValueAllocator) DIEBlock(); if (Location.isReg()) { if (Reg < 32) { @@ -1106,7 +1096,7 @@ DIE *DwarfDebug::createMemberDIE(const DIDerivedType &DT) { addSourceLine(MemberDie, &DT); - DIEBlock *MemLocationDie = new DIEBlock(); + DIEBlock *MemLocationDie = new (DIEValueAllocator) DIEBlock(); addUInt(MemLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst); uint64_t Size = DT.getSizeInBits(); @@ -1142,7 +1132,7 @@ DIE *DwarfDebug::createMemberDIE(const DIDerivedType &DT) { // expression to extract appropriate offset from vtable. 
// BaseAddr = ObAddr + *((*ObAddr) - Offset) - DIEBlock *VBaseLocationDie = new DIEBlock(); + DIEBlock *VBaseLocationDie = new (DIEValueAllocator) DIEBlock(); addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_dup); addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref); addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_constu); @@ -1208,7 +1198,7 @@ DIE *DwarfDebug::createSubprogramDIE(const DISubprogram &SP, bool MakeDecl) { unsigned VK = SP.getVirtuality(); if (VK) { addUInt(SPDie, dwarf::DW_AT_virtuality, dwarf::DW_FORM_flag, VK); - DIEBlock *Block = new DIEBlock(); + DIEBlock *Block = new (DIEValueAllocator) DIEBlock(); addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_constu); addUInt(Block, 0, dwarf::DW_FORM_data1, SP.getVirtualIndex()); addBlock(SPDie, dwarf::DW_AT_vtable_elem_location, 0, Block); @@ -1244,13 +1234,13 @@ DIE *DwarfDebug::createSubprogramDIE(const DISubprogram &SP, bool MakeDecl) { return SPDie; } -/// getUpdatedDbgScope - Find or create DbgScope assicated with the instruction. -/// Initialize scope and update scope hierarchy. +/// getUpdatedDbgScope - Find DbgScope assicated with the instruction. +/// Update scope hierarchy. Create abstract scope if required. 
DbgScope *DwarfDebug::getUpdatedDbgScope(MDNode *N, const MachineInstr *MI, - MDNode *InlinedAt) { - assert (N && "Invalid Scope encoding!"); - assert (MI && "Missing machine instruction!"); - bool GetConcreteScope = (MI && InlinedAt); + MDNode *InlinedAt) { + assert(N && "Invalid Scope encoding!"); + assert(MI && "Missing machine instruction!"); + bool isAConcreteScope = InlinedAt != 0; DbgScope *NScope = NULL; @@ -1258,17 +1248,17 @@ DbgScope *DwarfDebug::getUpdatedDbgScope(MDNode *N, const MachineInstr *MI, NScope = DbgScopeMap.lookup(InlinedAt); else NScope = DbgScopeMap.lookup(N); - assert (NScope && "Unable to find working scope!"); + assert(NScope && "Unable to find working scope!"); if (NScope->getFirstInsn()) return NScope; DbgScope *Parent = NULL; - if (GetConcreteScope) { + if (isAConcreteScope) { DILocation IL(InlinedAt); Parent = getUpdatedDbgScope(IL.getScope().getNode(), MI, IL.getOrigLocation().getNode()); - assert (Parent && "Unable to find Parent scope!"); + assert(Parent && "Unable to find Parent scope!"); NScope->setParent(Parent); Parent->addScope(NScope); } else if (DIDescriptor(N).isLexicalBlock()) { @@ -1286,7 +1276,7 @@ DbgScope *DwarfDebug::getUpdatedDbgScope(MDNode *N, const MachineInstr *MI, CurrentFnDbgScope = NScope; } - if (GetConcreteScope) { + if (isAConcreteScope) { ConcreteScopes[InlinedAt] = NScope; getOrCreateAbstractScope(N); } @@ -1295,7 +1285,7 @@ DbgScope *DwarfDebug::getUpdatedDbgScope(MDNode *N, const MachineInstr *MI, } DbgScope *DwarfDebug::getOrCreateAbstractScope(MDNode *N) { - assert (N && "Invalid Scope encoding!"); + assert(N && "Invalid Scope encoding!"); DbgScope *AScope = AbstractScopes.lookup(N); if (AScope) @@ -1377,7 +1367,7 @@ DIE *DwarfDebug::updateSubprogramScopeDIE(MDNode *SPNode) { DIE *DwarfDebug::constructLexicalScopeDIE(DbgScope *Scope) { MCSymbol *Start = Scope->getStartLabel(); MCSymbol *End = Scope->getEndLabel(); - if (Start == 0) return 0; + if (Start == 0 || End == 0) return 0; 
assert(Start->isDefined() && "Invalid starting label for an inlined scope!"); assert(End->isDefined() && "Invalid end label for an inlined scope!"); @@ -1400,7 +1390,7 @@ DIE *DwarfDebug::constructLexicalScopeDIE(DbgScope *Scope) { DIE *DwarfDebug::constructInlinedScopeDIE(DbgScope *Scope) { MCSymbol *StartLabel = Scope->getStartLabel(); MCSymbol *EndLabel = Scope->getEndLabel(); - if (StartLabel == 0) return 0; + if (StartLabel == 0 || EndLabel == 0) return 0; assert(StartLabel->isDefined() && "Invalid starting label for an inlined scope!"); @@ -1413,7 +1403,7 @@ DIE *DwarfDebug::constructInlinedScopeDIE(DbgScope *Scope) { DISubprogram InlinedSP = getDISubprogram(DS.getNode()); DIE *OriginDIE = ModuleCU->getDIE(InlinedSP.getNode()); - assert (OriginDIE && "Unable to find Origin DIE!"); + assert(OriginDIE && "Unable to find Origin DIE!"); addDIEEntry(ScopeDIE, dwarf::DW_AT_abstract_origin, dwarf::DW_FORM_ref4, OriginDIE); @@ -1477,9 +1467,9 @@ DIE *DwarfDebug::constructVariableDIE(DbgVariable *DV, DbgScope *Scope) { DISubprogram InlinedSP = getDISubprogram(DS.getNode()); DIE *OriginSPDIE = ModuleCU->getDIE(InlinedSP.getNode()); (void) OriginSPDIE; - assert (OriginSPDIE && "Unable to find Origin DIE for the SP!"); + assert(OriginSPDIE && "Unable to find Origin DIE for the SP!"); DIE *AbsDIE = DV->getAbstractVariable()->getDIE(); - assert (AbsDIE && "Unable to find Origin DIE for the Variable!"); + assert(AbsDIE && "Unable to find Origin DIE for the Variable!"); addDIEEntry(VariableDie, dwarf::DW_AT_abstract_origin, dwarf::DW_FORM_ref4, AbsDIE); } @@ -1508,12 +1498,18 @@ DIE *DwarfDebug::constructVariableDIE(DbgVariable *DV, DbgScope *Scope) { MachineLocation Location; Location.set(DbgValueInsn->getOperand(0).getReg()); addAddress(VariableDie, dwarf::DW_AT_location, Location); + if (MCSymbol *VS = DV->getDbgValueLabel()) + addLabel(VariableDie, dwarf::DW_AT_start_scope, dwarf::DW_FORM_addr, + VS); } else if (DbgValueInsn->getOperand(0).getType() == 
MachineOperand::MO_Immediate) { - DIEBlock *Block = new DIEBlock(); + DIEBlock *Block = new (DIEValueAllocator) DIEBlock(); unsigned Imm = DbgValueInsn->getOperand(0).getImm(); addUInt(Block, 0, dwarf::DW_FORM_udata, Imm); addBlock(VariableDie, dwarf::DW_AT_const_value, 0, Block); + if (MCSymbol *VS = DV->getDbgValueLabel()) + addLabel(VariableDie, dwarf::DW_AT_start_scope, dwarf::DW_FORM_addr, + VS); } else { //FIXME : Handle other operand types. delete VariableDie; @@ -1523,7 +1519,8 @@ DIE *DwarfDebug::constructVariableDIE(DbgVariable *DV, DbgScope *Scope) { } else { MachineLocation Location; unsigned FrameReg; - int Offset = RI->getFrameIndexReference(*MF, DV->getFrameIndex(), FrameReg); + int Offset = RI->getFrameIndexReference(*MF, DV->getFrameIndex(), + FrameReg); Location.set(FrameReg, Offset); if (VD.hasComplexAddress()) @@ -1576,10 +1573,9 @@ DIE *DwarfDebug::constructScopeDIE(DbgScope *Scope) { else ScopeDIE = updateSubprogramScopeDIE(DS.getNode()); } - else { + else ScopeDIE = constructLexicalScopeDIE(Scope); - if (!ScopeDIE) return NULL; - } + if (!ScopeDIE) return NULL; // Add variables to scope. const SmallVector &Variables = Scope->getVariables(); @@ -1608,7 +1604,7 @@ DIE *DwarfDebug::constructScopeDIE(DbgScope *Scope) { /// source file names. If none currently exists, create a new id and insert it /// in the SourceIds map. This can update DirectoryNames and SourceFileNames /// maps as well. -unsigned DwarfDebug::GetOrCreateSourceID(StringRef DirName, StringRef FileName) { +unsigned DwarfDebug::GetOrCreateSourceID(StringRef DirName, StringRef FileName){ unsigned DId; StringMap::iterator DI = DirectoryIdMap.find(DirName); if (DI != DirectoryIdMap.end()) { @@ -1666,15 +1662,19 @@ void DwarfDebug::constructCompileUnit(MDNode *N) { unsigned ID = GetOrCreateSourceID(Dir, FN); DIE *Die = new DIE(dwarf::DW_TAG_compile_unit); - // FIXME: Why getting the delta between two identical labels?? 
- addSectionOffset(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4, - getTempLabel("section_line"), getTempLabel("section_line"), - false); addString(Die, dwarf::DW_AT_producer, dwarf::DW_FORM_string, DIUnit.getProducer()); addUInt(Die, dwarf::DW_AT_language, dwarf::DW_FORM_data1, DIUnit.getLanguage()); addString(Die, dwarf::DW_AT_name, dwarf::DW_FORM_string, FN); + addLabel(Die, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr, + getTempLabel("text_begin")); + addLabel(Die, dwarf::DW_AT_high_pc, dwarf::DW_FORM_addr, + getTempLabel("text_end")); + // DW_AT_stmt_list is a offset of line number information for this + // compile unit in debug_line section. It is always zero when only one + // compile unit is emitted in one object file. + addUInt(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4, 0); if (!Dir.empty()) addString(Die, dwarf::DW_AT_comp_dir, dwarf::DW_FORM_string, Dir); @@ -1717,13 +1717,13 @@ void DwarfDebug::constructGlobalVariableDIE(MDNode *N) { DIDescriptor GVContext = DI_GV.getContext(); // Do not create specification DIE if context is either compile unit // or a subprogram. - if (DI_GV.isDefinition() && !GVContext.isCompileUnit() - && !GVContext.isFile() && !GVContext.isSubprogram()) { + if (DI_GV.isDefinition() && !GVContext.isCompileUnit() && + !GVContext.isFile() && !GVContext.isSubprogram()) { // Create specification DIE. 
DIE *VariableSpecDIE = new DIE(dwarf::DW_TAG_variable); addDIEEntry(VariableSpecDIE, dwarf::DW_AT_specification, dwarf::DW_FORM_ref4, VariableDie); - DIEBlock *Block = new DIEBlock(); + DIEBlock *Block = new (DIEValueAllocator) DIEBlock(); addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_addr); addLabel(Block, 0, dwarf::DW_FORM_udata, Asm->Mang->getSymbol(DI_GV.getGlobal())); @@ -1731,7 +1731,7 @@ void DwarfDebug::constructGlobalVariableDIE(MDNode *N) { addUInt(VariableDie, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1); ModuleCU->addDie(VariableSpecDIE); } else { - DIEBlock *Block = new DIEBlock(); + DIEBlock *Block = new (DIEValueAllocator) DIEBlock(); addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_addr); addLabel(Block, 0, dwarf::DW_FORM_udata, Asm->Mang->getSymbol(DI_GV.getGlobal())); @@ -1745,7 +1745,7 @@ void DwarfDebug::constructGlobalVariableDIE(MDNode *N) { DIType GTy = DI_GV.getType(); if (GTy.isCompositeType() && !GTy.getName().empty()) { DIEEntry *Entry = ModuleCU->getDIEEntry(GTy.getNode()); - assert (Entry && "Missing global type!"); + assert(Entry && "Missing global type!"); ModuleCU->addGlobalType(GTy.getName(), Entry->getEntry()); } return; @@ -1783,12 +1783,11 @@ void DwarfDebug::constructSubprogramDIE(MDNode *N) { void DwarfDebug::beginModule(Module *M, MachineModuleInfo *mmi) { this->M = M; - if (TimePassesIsEnabled) - DebugTimer->startTimer(); - if (!MAI->doesSupportDebugInformation()) return; + TimeRegion Timer(DebugTimer); + DebugInfoFinder DbgFinder; DbgFinder.processModule(*M); @@ -1836,9 +1835,6 @@ void DwarfDebug::beginModule(Module *M, MachineModuleInfo *mmi) { // Emit initial sections emitInitial(); - - if (TimePassesIsEnabled) - DebugTimer->stopTimer(); } /// endModule - Emit all Dwarf sections that should come after the content. 
@@ -1847,8 +1843,7 @@ void DwarfDebug::endModule() { if (!ModuleCU) return; - if (TimePassesIsEnabled) - DebugTimer->startTimer(); + TimeRegion Timer(DebugTimer); // Attach DW_AT_inline attribute with inlined subprogram DIEs. for (SmallPtrSet::iterator AI = InlinedSubprogramDIEs.begin(), @@ -1871,7 +1866,7 @@ void DwarfDebug::endModule() { if (!NDie) continue; addDIEEntry(SPDie, dwarf::DW_AT_containing_type, dwarf::DW_FORM_ref4, NDie); // FIXME - This is not the correct approach. - // addDIEEntry(NDie, dwarf::DW_AT_containing_type, dwarf::DW_FORM_ref4, NDie); + //addDIEEntry(NDie, dwarf::DW_AT_containing_type, dwarf::DW_FORM_ref4, NDie } // Standard sections final addresses. @@ -1932,9 +1927,6 @@ void DwarfDebug::endModule() { delete ModuleCU; ModuleCU = NULL; // Reset for the next Module, if any. - - if (TimePassesIsEnabled) - DebugTimer->stopTimer(); } /// findAbstractVariable - Find abstract variable, if any, associated with Var. @@ -1971,10 +1963,11 @@ DbgVariable *DwarfDebug::findAbstractVariable(DIVariable &Var, if (!Scope) return NULL; - AbsDbgVariable = new DbgVariable(Var, MI, + AbsDbgVariable = new DbgVariable(Var, MI, NULL /* No more-abstract variable*/); Scope->addVariable(AbsDbgVariable); AbstractVariables[Var.getNode()] = AbsDbgVariable; + DbgValueStartMap[MI] = AbsDbgVariable; return AbsDbgVariable; } @@ -2010,16 +2003,19 @@ void DwarfDebug::collectVariableInfo() { for (MachineBasicBlock::const_iterator II = I->begin(), IE = I->end(); II != IE; ++II) { const MachineInstr *MInsn = II; - if (MInsn->getOpcode() != TargetOpcode::DBG_VALUE) + if (!MInsn->isDebugValue()) continue; + // FIXME : Lift this restriction. if (MInsn->getNumOperands() != 3) continue; - DIVariable DV((MDNode*)(MInsn->getOperand(MInsn->getNumOperands() - 1).getMetadata())); + DIVariable DV((MDNode*)(MInsn->getOperand(MInsn->getNumOperands() + - 1).getMetadata())); if (DV.getTag() == dwarf::DW_TAG_arg_variable) { // FIXME Handle inlined subroutine arguments. 
DbgVariable *ArgVar = new DbgVariable(DV, MInsn, NULL); CurrentFnDbgScope->addVariable(ArgVar); + DbgValueStartMap[MInsn] = ArgVar; continue; } @@ -2034,19 +2030,54 @@ void DwarfDebug::collectVariableInfo() { if (!Scope) continue; - DbgVariable *AbsDbgVariable = findAbstractVariable(DV, MInsn, - ScopeLoc); + DbgVariable *AbsDbgVariable = findAbstractVariable(DV, MInsn, ScopeLoc); DbgVariable *RegVar = new DbgVariable(DV, MInsn, AbsDbgVariable); + DbgValueStartMap[MInsn] = RegVar; Scope->addVariable(RegVar); } } } -/// beginScope - Process beginning of a scope starting at Label. -void DwarfDebug::beginScope(const MachineInstr *MI, MCSymbol *Label) { +/// beginScope - Process beginning of a scope. +void DwarfDebug::beginScope(const MachineInstr *MI) { + // Check location. + DebugLoc DL = MI->getDebugLoc(); + if (DL.isUnknown()) + return; + DILocation DILoc = MF->getDILocation(DL); + if (!DILoc.getScope().Verify()) + return; + + // Check and update last known location info. + if(DILoc.getNode() == PrevDILoc) + return; + + // DBG_VALUE instruction establishes new value. + if (MI->isDebugValue()) { + DenseMap::iterator DI + = DbgValueStartMap.find(MI); + if (DI != DbgValueStartMap.end()) { + MCSymbol *Label = recordSourceLine(DILoc.getLineNumber(), + DILoc.getColumnNumber(), + DILoc.getScope().getNode()); + PrevDILoc = DILoc.getNode(); + DI->second->setDbgValueLabel(Label); + } + return; + } + + // Emit a label to indicate location change. This is used for line + // table even if this instruction does start a new scope. + MCSymbol *Label = recordSourceLine(DILoc.getLineNumber(), + DILoc.getColumnNumber(), + DILoc.getScope().getNode()); + PrevDILoc = DILoc.getNode(); + + // update DbgScope if this instruction starts a new scope. 
InsnToDbgScopeMapTy::iterator I = DbgScopeBeginMap.find(MI); if (I == DbgScopeBeginMap.end()) return; + ScopeVector &SD = I->second; for (ScopeVector::iterator SDI = SD.begin(), SDE = SD.end(); SDI != SDE; ++SDI) @@ -2055,6 +2086,19 @@ void DwarfDebug::beginScope(const MachineInstr *MI, MCSymbol *Label) { /// endScope - Process end of a scope. void DwarfDebug::endScope(const MachineInstr *MI) { + // Ignore DBG_VALUE instruction. + if (MI->isDebugValue()) + return; + + // Check location. + DebugLoc DL = MI->getDebugLoc(); + if (DL.isUnknown()) + return; + DILocation DILoc = MF->getDILocation(DL); + if (!DILoc.getScope().Verify()) + return; + + // Emit a label and update DbgScope if this instruction ends a scope. InsnToDbgScopeMapTy::iterator I = DbgScopeEndMap.find(MI); if (I == DbgScopeEndMap.end()) return; @@ -2094,7 +2138,7 @@ void DwarfDebug::createDbgScope(MDNode *Scope, MDNode *InlinedAt) { } /// extractScopeInformation - Scan machine instructions in this function -/// and collect DbgScopes. Return true, if atleast one scope was found. +/// and collect DbgScopes. Return true, if at least one scope was found. bool DwarfDebug::extractScopeInformation() { // If scope information was extracted using .dbg intrinsics then there is not // any need to extract these information by scanning each instruction. @@ -2110,12 +2154,13 @@ bool DwarfDebug::extractScopeInformation() { II != IE; ++II) { const MachineInstr *MInsn = II; // FIXME : Remove DBG_VALUE check. - if (MInsn->getOpcode() == TargetOpcode::DBG_VALUE) continue; + if (MInsn->isDebugValue()) continue; MIIndexMap[MInsn] = MIIndex++; DebugLoc DL = MInsn->getDebugLoc(); if (DL.isUnknown()) continue; DILocation DLT = MF->getDILocation(DL); DIScope DLTScope = DLT.getScope(); + if (!DLTScope.getNode()) continue; // There is no need to create another DIE for compile unit. For all // other scopes, create one DbgScope now. This will be translated // into a scope DIE at the end. 
@@ -2132,11 +2177,12 @@ bool DwarfDebug::extractScopeInformation() { II != IE; ++II) { const MachineInstr *MInsn = II; // FIXME : Remove DBG_VALUE check. - if (MInsn->getOpcode() == TargetOpcode::DBG_VALUE) continue; + if (MInsn->isDebugValue()) continue; DebugLoc DL = MInsn->getDebugLoc(); if (DL.isUnknown()) continue; DILocation DLT = MF->getDILocation(DL); DIScope DLTScope = DLT.getScope(); + if (!DLTScope.getNode()) continue; // There is no need to create another DIE for compile unit. For all // other scopes, create one DbgScope now. This will be translated // into a scope DIE at the end. @@ -2159,7 +2205,7 @@ bool DwarfDebug::extractScopeInformation() { SmallVector WorkList; WorkList.push_back(CurrentFnDbgScope); while (!WorkList.empty()) { - DbgScope *S = WorkList.back(); WorkList.pop_back(); + DbgScope *S = WorkList.pop_back_val(); const SmallVector &Children = S->getScopes(); if (!Children.empty()) @@ -2170,7 +2216,7 @@ bool DwarfDebug::extractScopeInformation() { if (S->isAbstractScope()) continue; const MachineInstr *MI = S->getFirstInsn(); - assert (MI && "DbgScope does not have first instruction!"); + assert(MI && "DbgScope does not have first instruction!"); InsnToDbgScopeMapTy::iterator IDI = DbgScopeBeginMap.find(MI); if (IDI != DbgScopeBeginMap.end()) @@ -2179,7 +2225,7 @@ bool DwarfDebug::extractScopeInformation() { DbgScopeBeginMap[MI].push_back(S); MI = S->getLastInsn(); - assert (MI && "DbgScope does not have last instruction!"); + assert(MI && "DbgScope does not have last instruction!"); IDI = DbgScopeEndMap.find(MI); if (IDI != DbgScopeEndMap.end()) IDI->second.push_back(S); @@ -2196,12 +2242,10 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) { this->MF = MF; if (!ShouldEmitDwarfDebug()) return; - - if (TimePassesIsEnabled) - DebugTimer->startTimer(); - if (!extractScopeInformation()) return; + + TimeRegion Timer(DebugTimer); collectVariableInfo(); @@ -2225,20 +2269,15 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) { 
recordSourceLine(Line, Col, DLT.getScope().getNode()); } - if (TimePassesIsEnabled) - DebugTimer->stopTimer(); } /// endFunction - Gather and emit post-function debug information. /// void DwarfDebug::endFunction(const MachineFunction *MF) { if (!ShouldEmitDwarfDebug()) return; - - if (TimePassesIsEnabled) - DebugTimer->startTimer(); - - if (DbgScopeMap.empty()) - return; + if (DbgScopeMap.empty()) return; + + TimeRegion Timer(DebugTimer); if (CurrentFnDbgScope) { // Define end label for subprogram. @@ -2271,14 +2310,12 @@ void DwarfDebug::endFunction(const MachineFunction *MF) { DeleteContainerSeconds(DbgScopeMap); DbgScopeBeginMap.clear(); DbgScopeEndMap.clear(); + DbgValueStartMap.clear(); ConcreteScopes.clear(); DeleteContainerSeconds(AbstractScopes); AbstractScopesList.clear(); AbstractVariables.clear(); Lines.clear(); - - if (TimePassesIsEnabled) - DebugTimer->stopTimer(); } /// recordSourceLine - Register a source line with debug info. Returns the @@ -2288,8 +2325,7 @@ MCSymbol *DwarfDebug::recordSourceLine(unsigned Line, unsigned Col, MDNode *S) { if (!MMI) return 0; - if (TimePassesIsEnabled) - DebugTimer->startTimer(); + TimeRegion Timer(DebugTimer); StringRef Dir; StringRef Fn; @@ -2314,9 +2350,6 @@ MCSymbol *DwarfDebug::recordSourceLine(unsigned Line, unsigned Col, MDNode *S) { MCSymbol *Label = MMI->getContext().CreateTempSymbol(); Lines.push_back(SrcLineInfo(Line, Col, Src, Label)); - if (TimePassesIsEnabled) - DebugTimer->stopTimer(); - Asm->OutStreamer.EmitLabel(Label); return Label; } @@ -2328,15 +2361,8 @@ MCSymbol *DwarfDebug::recordSourceLine(unsigned Line, unsigned Col, MDNode *S) { /// well. 
unsigned DwarfDebug::getOrCreateSourceID(const std::string &DirName, const std::string &FileName) { - if (TimePassesIsEnabled) - DebugTimer->startTimer(); - - unsigned SrcId = GetOrCreateSourceID(DirName.c_str(), FileName.c_str()); - - if (TimePassesIsEnabled) - DebugTimer->stopTimer(); - - return SrcId; + TimeRegion Timer(DebugTimer); + return GetOrCreateSourceID(DirName.c_str(), FileName.c_str()); } //===----------------------------------------------------------------------===// @@ -2351,7 +2377,8 @@ DwarfDebug::computeSizeAndOffset(DIE *Die, unsigned Offset, bool Last) { const std::vector &Children = Die->getChildren(); // If not last sibling and has children then add sibling offset attribute. - if (!Last && !Children.empty()) Die->addSiblingOffset(); + if (!Last && !Children.empty()) + Die->addSiblingOffset(DIEValueAllocator); // Record the abbreviation. assignAbbrevNumber(Die->getAbbrev()); @@ -2465,7 +2492,7 @@ void DwarfDebug::emitDIE(DIE *Die) { dwarf::TagString(Abbrev->getTag())); EmitULEB128(AbbrevNumber); - SmallVector &Values = Die->getValues(); + const SmallVector &Values = Die->getValues(); const SmallVector &AbbrevData = Abbrev->getData(); // Emit the DIE attribute values. diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h index d6634e1f5d10..ad6b0c2cb7cd 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.h +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h @@ -19,6 +19,7 @@ #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineLocation.h" #include "llvm/Analysis/DebugInfo.h" +#include "llvm/Support/Allocator.h" #include "llvm/Support/raw_ostream.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/FoldingSet.h" @@ -98,9 +99,11 @@ class DwarfDebug : public DwarfPrinter { /// Lines - List of source line correspondence. std::vector Lines; - /// DIEValues - A list of all the unique values in use. - /// - std::vector DIEValues; + /// DIEBlocks - A list of all the DIEBlocks in use. 
+ std::vector DIEBlocks; + + // DIEValueAllocator - All DIEValues are allocated through this allocator. + BumpPtrAllocator DIEValueAllocator; /// StringPool - A String->Symbol mapping of strings used by indirect /// references. @@ -141,12 +144,21 @@ class DwarfDebug : public DwarfPrinter { /// AbstractScopes - Tracks the abstract scopes a module. These scopes are /// not included DbgScopeMap. AbstractScopes owns its DbgScope*s. DenseMap AbstractScopes; + + /// AbstractScopesList - Tracks abstract scopes constructed while processing + /// a function. This list is cleared during endFunction(). SmallVectorAbstractScopesList; /// AbstractVariables - Collection on abstract variables. Owned by the /// DbgScopes in AbstractScopes. DenseMap AbstractVariables; + /// DbgValueStartMap - Tracks starting scope of variable DIEs. + /// If the scope of an object begins sometime after the low pc value for the + /// scope most closely enclosing the object, the object entry may have a + /// DW_AT_start_scope attribute. + DenseMap DbgValueStartMap; + /// InliendSubprogramDIEs - Collection of subprgram DIEs that are marked /// (at the end of the module) as DW_AT_inline. SmallPtrSet InlinedSubprogramDIEs; @@ -181,6 +193,10 @@ class DwarfDebug : public DwarfPrinter { /// function. DenseMap CompileUnitOffsets; + /// Previous instruction's location information. This is used to determine + /// label location to indicate scope boundries in dwarf debug info. + mutable const MDNode *PrevDILoc; + /// DebugTimer - Timer for the Dwarf debug writer. Timer *DebugTimer; @@ -250,12 +266,6 @@ class DwarfDebug : public DwarfPrinter { void addLabel(DIE *Die, unsigned Attribute, unsigned Form, const MCSymbol *Label); - /// addSectionOffset - Add a section offset label attribute data and value. - /// - void addSectionOffset(DIE *Die, unsigned Attribute, unsigned Form, - const MCSymbol *Label, const MCSymbol *Section, - bool isEH = false); - /// addDelta - Add a label delta attribute data and value. 
/// void addDelta(DIE *Die, unsigned Attribute, unsigned Form, @@ -545,8 +555,8 @@ class DwarfDebug : public DwarfPrinter { /// collectVariableInfo - Populate DbgScope entries with variables' info. void collectVariableInfo(); - /// beginScope - Process beginning of a scope starting at Label. - void beginScope(const MachineInstr *MI, MCSymbol *Label); + /// beginScope - Process beginning of a scope. + void beginScope(const MachineInstr *MI); /// endScope - Prcess end of a scope. void endScope(const MachineInstr *MI); diff --git a/lib/CodeGen/AsmPrinter/DwarfException.cpp b/lib/CodeGen/AsmPrinter/DwarfException.cpp index 4946b4c3c25e..8b616b012614 100644 --- a/lib/CodeGen/AsmPrinter/DwarfException.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfException.cpp @@ -419,23 +419,24 @@ bool DwarfException::CallToNoUnwindFunction(const MachineInstr *MI) { for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) { const MachineOperand &MO = MI->getOperand(I); - if (MO.isGlobal()) { - if (Function *F = dyn_cast(MO.getGlobal())) { - if (SawFunc) { - // Be conservative. If we have more than one function operand for this - // call, then we can't make the assumption that it's the callee and - // not a parameter to the call. - // - // FIXME: Determine if there's a way to say that `F' is the callee or - // parameter. - MarkedNoUnwind = false; - break; - } + if (!MO.isGlobal()) continue; + + Function *F = dyn_cast(MO.getGlobal()); + if (F == 0) continue; - MarkedNoUnwind = F->doesNotThrow(); - SawFunc = true; - } + if (SawFunc) { + // Be conservative. If we have more than one function operand for this + // call, then we can't make the assumption that it's the callee and + // not a parameter to the call. + // + // FIXME: Determine if there's a way to say that `F' is the callee or + // parameter. 
+ MarkedNoUnwind = false; + break; } + + MarkedNoUnwind = F->doesNotThrow(); + SawFunc = true; } return MarkedNoUnwind; @@ -504,7 +505,10 @@ ComputeCallSiteTable(SmallVectorImpl &CallSites, LastLabel = LandingPad->EndLabels[P.RangeIndex]; assert(BeginLabel && LastLabel && "Invalid landing pad!"); - if (LandingPad->LandingPadLabel) { + if (!LandingPad->LandingPadLabel) { + // Create a gap. + PreviousIsInvoke = false; + } else { // This try-range is for an invoke. CallSiteEntry Site = { BeginLabel, @@ -536,9 +540,6 @@ ComputeCallSiteTable(SmallVectorImpl &CallSites, CallSites[SiteNo - 1] = Site; } PreviousIsInvoke = true; - } else { - // Create a gap. - PreviousIsInvoke = false; } } } @@ -885,8 +886,7 @@ void DwarfException::EndModule() { if (!shouldEmitMovesModule && !shouldEmitTableModule) return; - if (TimePassesIsEnabled) - ExceptionTimer->startTimer(); + TimeRegion Timer(ExceptionTimer); const std::vector Personalities = MMI->getPersonalities(); @@ -896,9 +896,6 @@ void DwarfException::EndModule() { for (std::vector::iterator I = EHFrames.begin(), E = EHFrames.end(); I != E; ++I) EmitFDE(*I); - - if (TimePassesIsEnabled) - ExceptionTimer->stopTimer(); } /// BeginFunction - Gather pre-function exception information. Assumes it's @@ -906,9 +903,7 @@ void DwarfException::EndModule() { void DwarfException::BeginFunction(const MachineFunction *MF) { if (!MMI || !MAI->doesSupportExceptionHandling()) return; - if (TimePassesIsEnabled) - ExceptionTimer->startTimer(); - + TimeRegion Timer(ExceptionTimer); this->MF = MF; shouldEmitTable = shouldEmitMoves = false; @@ -924,9 +919,6 @@ void DwarfException::BeginFunction(const MachineFunction *MF) { shouldEmitTableModule |= shouldEmitTable; shouldEmitMovesModule |= shouldEmitMoves; - - if (TimePassesIsEnabled) - ExceptionTimer->stopTimer(); } /// EndFunction - Gather and emit post-function exception information. 
@@ -934,9 +926,7 @@ void DwarfException::BeginFunction(const MachineFunction *MF) { void DwarfException::EndFunction() { if (!shouldEmitMoves && !shouldEmitTable) return; - if (TimePassesIsEnabled) - ExceptionTimer->startTimer(); - + TimeRegion Timer(ExceptionTimer); Asm->OutStreamer.EmitLabel(getDWLabel("eh_func_end", SubprogramCount)); // Record if this personality index uses a landing pad. @@ -961,7 +951,4 @@ void DwarfException::EndFunction() { !MMI->getLandingPads().empty(), MMI->getFrameMoves(), MF->getFunction())); - - if (TimePassesIsEnabled) - ExceptionTimer->stopTimer(); } diff --git a/lib/CodeGen/AsmPrinter/DwarfPrinter.cpp b/lib/CodeGen/AsmPrinter/DwarfPrinter.cpp index e21269604e24..17eb2e872742 100644 --- a/lib/CodeGen/AsmPrinter/DwarfPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfPrinter.cpp @@ -45,14 +45,14 @@ MCSymbol *DwarfPrinter::getDWLabel(const char *Name, unsigned ID) const { //assert(ID && "Should use getTempLabel if no ID"); if (ID == 0) return getTempLabel(Name); - return Asm->OutContext.GetOrCreateTemporarySymbol + return Asm->OutContext.GetOrCreateSymbol (Twine(MAI->getPrivateGlobalPrefix()) + Twine(Name) + Twine(ID)); } /// getTempLabel - Return the MCSymbol corresponding to the assembler temporary /// label with the specified name. MCSymbol *DwarfPrinter::getTempLabel(const char *Name) const { - return Asm->OutContext.GetOrCreateTemporarySymbol + return Asm->OutContext.GetOrCreateSymbol (Twine(MAI->getPrivateGlobalPrefix()) + Name); } diff --git a/lib/CodeGen/AsmPrinter/DwarfWriter.cpp b/lib/CodeGen/AsmPrinter/DwarfWriter.cpp index 9fd4c4478540..a2d7ab1224ee 100644 --- a/lib/CodeGen/AsmPrinter/DwarfWriter.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfWriter.cpp @@ -73,27 +73,14 @@ void DwarfWriter::EndFunction(const MachineFunction *MF) { MMI->EndFunction(); } -/// RecordSourceLine - Register a source line with debug info. Returns the -/// unique label that was emitted and which provides correspondence to -/// the source line list. 
-MCSymbol *DwarfWriter::RecordSourceLine(unsigned Line, unsigned Col, - MDNode *Scope) { - return DD->recordSourceLine(Line, Col, Scope); -} - -/// getRecordSourceLineCount - Count source lines. -unsigned DwarfWriter::getRecordSourceLineCount() { - return DD->getSourceLineCount(); -} - /// ShouldEmitDwarfDebug - Returns true if Dwarf debugging declarations should /// be emitted. bool DwarfWriter::ShouldEmitDwarfDebug() const { return DD && DD->ShouldEmitDwarfDebug(); } -void DwarfWriter::BeginScope(const MachineInstr *MI, MCSymbol *L) { - DD->beginScope(MI, L); +void DwarfWriter::BeginScope(const MachineInstr *MI) { + DD->beginScope(MI); } void DwarfWriter::EndScope(const MachineInstr *MI) { DD->endScope(MI); diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp index 151e9cd4403d..8f519407ccd4 100644 --- a/lib/CodeGen/BranchFolding.cpp +++ b/lib/CodeGen/BranchFolding.cpp @@ -972,15 +972,21 @@ static bool IsBetterFallthrough(MachineBasicBlock *MBB1, // MBB1 doesn't, we prefer to fall through into MBB1. This allows us to // optimize branches that branch to either a return block or an assert block // into a fallthrough to the return. - if (MBB1->empty() || MBB2->empty()) return false; + if (IsEmptyBlock(MBB1) || IsEmptyBlock(MBB2)) return false; // If there is a clear successor ordering we make sure that one block // will fall through to the next if (MBB1->isSuccessor(MBB2)) return true; if (MBB2->isSuccessor(MBB1)) return false; - MachineInstr *MBB1I = --MBB1->end(); - MachineInstr *MBB2I = --MBB2->end(); + // Neither block consists entirely of debug info (per IsEmptyBlock check), + // so we needn't test for falling off the beginning here. 
+ MachineBasicBlock::iterator MBB1I = --MBB1->end(); + while (MBB1I->isDebugValue()) + --MBB1I; + MachineBasicBlock::iterator MBB2I = --MBB2->end(); + while (MBB2I->isDebugValue()) + --MBB2I; return MBB2I->getDesc().isCall() && !MBB1I->getDesc().isCall(); } diff --git a/lib/CodeGen/DwarfEHPrepare.cpp b/lib/CodeGen/DwarfEHPrepare.cpp index 39fc85e7649e..8bae9edde721 100644 --- a/lib/CodeGen/DwarfEHPrepare.cpp +++ b/lib/CodeGen/DwarfEHPrepare.cpp @@ -8,19 +8,19 @@ //===----------------------------------------------------------------------===// // // This pass mulches exception handling code into a form adapted to code -// generation. Required if using dwarf exception handling. +// generation. Required if using dwarf exception handling. // //===----------------------------------------------------------------------===// #define DEBUG_TYPE "dwarfehprepare" -#include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/Dominators.h" -#include "llvm/CodeGen/Passes.h" #include "llvm/Function.h" #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" #include "llvm/Module.h" #include "llvm/Pass.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" @@ -40,6 +40,15 @@ namespace { // The eh.exception intrinsic. Function *ExceptionValueIntrinsic; + // The eh.selector intrinsic. + Function *SelectorIntrinsic; + + // _Unwind_Resume_or_Rethrow call. + Constant *URoR; + + // The EH language-specific catch-all type. + GlobalVariable *EHCatchAllValue; + // _Unwind_Resume or the target equivalent. Constant *RewindFunction; @@ -67,18 +76,88 @@ namespace { Instruction *CreateValueLoad(BasicBlock *BB); /// CreateReadOfExceptionValue - Return the result of the eh.exception - /// intrinsic by calling the intrinsic if in a landing pad, or loading - /// it from the exception value variable otherwise. 
+ /// intrinsic by calling the intrinsic if in a landing pad, or loading it + /// from the exception value variable otherwise. Instruction *CreateReadOfExceptionValue(BasicBlock *BB) { return LandingPads.count(BB) ? CreateExceptionValueCall(BB) : CreateValueLoad(BB); } + /// CleanupSelectors - Any remaining eh.selector intrinsic calls which still + /// use the ".llvm.eh.catch.all.value" call need to convert to using it's + /// initializer instead. + bool CleanupSelectors(); + + /// FindAllCleanupSelectors - Find all eh.selector calls that are clean-ups. + void FindAllCleanupSelectors(SmallPtrSet &Sels); + + /// FindAllURoRInvokes - Find all URoR invokes in the function. + void FindAllURoRInvokes(SmallPtrSet &URoRInvokes); + + /// HandleURoRInvokes - Handle invokes of "_Unwind_Resume_or_Rethrow" + /// calls. The "unwind" part of these invokes jump to a landing pad within + /// the current function. This is a candidate to merge the selector + /// associated with the URoR invoke with the one from the URoR's landing + /// pad. + bool HandleURoRInvokes(); + + /// FindSelectorAndURoR - Find the eh.selector call and URoR call associated + /// with the eh.exception call. This recursively looks past instructions + /// which don't change the EH pointer value, like casts or PHI nodes. + bool FindSelectorAndURoR(Instruction *Inst, bool &URoRInvoke, + SmallPtrSet &SelCalls); + + /// DoMem2RegPromotion - Take an alloca call and promote it from memory to a + /// register. + bool DoMem2RegPromotion(Value *V) { + AllocaInst *AI = dyn_cast(V); + if (!AI || !isAllocaPromotable(AI)) return false; + + // Turn the alloca into a register. + std::vector Allocas(1, AI); + PromoteMemToReg(Allocas, *DT, *DF); + return true; + } + + /// PromoteStoreInst - Perform Mem2Reg on a StoreInst. 
+ bool PromoteStoreInst(StoreInst *SI) { + if (!SI || !DT || !DF) return false; + if (DoMem2RegPromotion(SI->getOperand(1))) + return true; + return false; + } + + /// PromoteEHPtrStore - Promote the storing of an EH pointer into a + /// register. This should get rid of the store and subsequent loads. + bool PromoteEHPtrStore(IntrinsicInst *II) { + if (!DT || !DF) return false; + + bool Changed = false; + StoreInst *SI; + + while (1) { + SI = 0; + for (Value::use_iterator + I = II->use_begin(), E = II->use_end(); I != E; ++I) { + SI = dyn_cast(I); + if (SI) break; + } + + if (!PromoteStoreInst(SI)) + break; + + Changed = true; + } + + return false; + } + public: static char ID; // Pass identification, replacement for typeid. DwarfEHPrepare(const TargetLowering *tli, bool fast) : FunctionPass(&ID), TLI(tli), CompileFast(fast), - ExceptionValueIntrinsic(0), RewindFunction(0) {} + ExceptionValueIntrinsic(0), SelectorIntrinsic(0), + URoR(0), EHCatchAllValue(0), RewindFunction(0) {} virtual bool runOnFunction(Function &Fn); @@ -105,6 +184,233 @@ FunctionPass *llvm::createDwarfEHPass(const TargetLowering *tli, bool fast) { return new DwarfEHPrepare(tli, fast); } +/// FindAllCleanupSelectors - Find all eh.selector calls that are clean-ups. +void DwarfEHPrepare:: +FindAllCleanupSelectors(SmallPtrSet &Sels) { + for (Value::use_iterator + I = SelectorIntrinsic->use_begin(), + E = SelectorIntrinsic->use_end(); I != E; ++I) { + IntrinsicInst *SI = cast(I); + if (!SI || SI->getParent()->getParent() != F) continue; + + unsigned NumOps = SI->getNumOperands(); + if (NumOps > 4) continue; + bool IsCleanUp = (NumOps == 3); + + if (!IsCleanUp) + if (ConstantInt *CI = dyn_cast(SI->getOperand(3))) + IsCleanUp = (CI->getZExtValue() == 0); + + if (IsCleanUp) + Sels.insert(SI); + } +} + +/// FindAllURoRInvokes - Find all URoR invokes in the function. 
+void DwarfEHPrepare:: +FindAllURoRInvokes(SmallPtrSet &URoRInvokes) { + for (Value::use_iterator + I = URoR->use_begin(), + E = URoR->use_end(); I != E; ++I) { + if (InvokeInst *II = dyn_cast(I)) + URoRInvokes.insert(II); + } +} + +/// CleanupSelectors - Any remaining eh.selector intrinsic calls which still use +/// the ".llvm.eh.catch.all.value" call need to convert to using it's +/// initializer instead. +bool DwarfEHPrepare::CleanupSelectors() { + if (!EHCatchAllValue) return false; + + if (!SelectorIntrinsic) { + SelectorIntrinsic = + Intrinsic::getDeclaration(F->getParent(), Intrinsic::eh_selector); + if (!SelectorIntrinsic) return false; + } + + bool Changed = false; + for (Value::use_iterator + I = SelectorIntrinsic->use_begin(), + E = SelectorIntrinsic->use_end(); I != E; ++I) { + IntrinsicInst *Sel = dyn_cast(I); + if (!Sel || Sel->getParent()->getParent() != F) continue; + + // Index of the ".llvm.eh.catch.all.value" variable. + unsigned OpIdx = Sel->getNumOperands() - 1; + GlobalVariable *GV = dyn_cast(Sel->getOperand(OpIdx)); + if (GV != EHCatchAllValue) continue; + Sel->setOperand(OpIdx, EHCatchAllValue->getInitializer()); + Changed = true; + } + + return Changed; +} + +/// FindSelectorAndURoR - Find the eh.selector call associated with the +/// eh.exception call. And indicate if there is a URoR "invoke" associated with +/// the eh.exception call. This recursively looks past instructions which don't +/// change the EH pointer value, like casts or PHI nodes. 
+bool +DwarfEHPrepare::FindSelectorAndURoR(Instruction *Inst, bool &URoRInvoke, + SmallPtrSet &SelCalls) { + SmallPtrSet SeenPHIs; + bool Changed = false; + + restart: + for (Value::use_iterator + I = Inst->use_begin(), E = Inst->use_end(); I != E; ++I) { + Instruction *II = dyn_cast(I); + if (!II || II->getParent()->getParent() != F) continue; + + if (IntrinsicInst *Sel = dyn_cast(II)) { + if (Sel->getIntrinsicID() == Intrinsic::eh_selector) + SelCalls.insert(Sel); + } else if (InvokeInst *Invoke = dyn_cast(II)) { + if (Invoke->getCalledFunction() == URoR) + URoRInvoke = true; + } else if (CastInst *CI = dyn_cast(II)) { + Changed |= FindSelectorAndURoR(CI, URoRInvoke, SelCalls); + } else if (StoreInst *SI = dyn_cast(II)) { + if (!PromoteStoreInst(SI)) continue; + Changed = true; + SeenPHIs.clear(); + goto restart; // Uses may have changed, restart loop. + } else if (PHINode *PN = dyn_cast(II)) { + if (SeenPHIs.insert(PN)) + // Don't process a PHI node more than once. + Changed |= FindSelectorAndURoR(PN, URoRInvoke, SelCalls); + } + } + + return Changed; +} + +/// HandleURoRInvokes - Handle invokes of "_Unwind_Resume_or_Rethrow" calls. The +/// "unwind" part of these invokes jump to a landing pad within the current +/// function. This is a candidate to merge the selector associated with the URoR +/// invoke with the one from the URoR's landing pad. +bool DwarfEHPrepare::HandleURoRInvokes() { + if (!DT) return CleanupSelectors(); // We require DominatorTree information. 
+ + if (!EHCatchAllValue) { + EHCatchAllValue = + F->getParent()->getNamedGlobal(".llvm.eh.catch.all.value"); + if (!EHCatchAllValue) return false; + } + + if (!SelectorIntrinsic) { + SelectorIntrinsic = + Intrinsic::getDeclaration(F->getParent(), Intrinsic::eh_selector); + if (!SelectorIntrinsic) return false; + } + + if (!URoR) { + URoR = F->getParent()->getFunction("_Unwind_Resume_or_Rethrow"); + if (!URoR) return CleanupSelectors(); + } + + SmallPtrSet Sels; + SmallPtrSet URoRInvokes; + FindAllCleanupSelectors(Sels); + FindAllURoRInvokes(URoRInvokes); + + SmallPtrSet SelsToConvert; + + for (SmallPtrSet::iterator + SI = Sels.begin(), SE = Sels.end(); SI != SE; ++SI) { + const BasicBlock *SelBB = (*SI)->getParent(); + for (SmallPtrSet::iterator + UI = URoRInvokes.begin(), UE = URoRInvokes.end(); UI != UE; ++UI) { + const BasicBlock *URoRBB = (*UI)->getParent(); + if (SelBB == URoRBB || DT->dominates(SelBB, URoRBB)) { + SelsToConvert.insert(*SI); + break; + } + } + } + + bool Changed = false; + + if (Sels.size() != SelsToConvert.size()) { + // If we haven't been able to convert all of the clean-up selectors, then + // loop through the slow way to see if they still need to be converted. + if (!ExceptionValueIntrinsic) { + ExceptionValueIntrinsic = + Intrinsic::getDeclaration(F->getParent(), Intrinsic::eh_exception); + if (!ExceptionValueIntrinsic) return CleanupSelectors(); + } + + for (Value::use_iterator + I = ExceptionValueIntrinsic->use_begin(), + E = ExceptionValueIntrinsic->use_end(); I != E; ++I) { + IntrinsicInst *EHPtr = dyn_cast(I); + if (!EHPtr || EHPtr->getParent()->getParent() != F) continue; + + Changed |= PromoteEHPtrStore(EHPtr); + + bool URoRInvoke = false; + SmallPtrSet SelCalls; + Changed |= FindSelectorAndURoR(EHPtr, URoRInvoke, SelCalls); + + if (URoRInvoke) { + // This EH pointer is being used by an invoke of an URoR instruction and + // an eh.selector intrinsic call. 
If the eh.selector is a 'clean-up', we + // need to convert it to a 'catch-all'. + for (SmallPtrSet::iterator + SI = SelCalls.begin(), SE = SelCalls.end(); SI != SE; ++SI) { + IntrinsicInst *II = *SI; + unsigned NumOps = II->getNumOperands(); + + if (NumOps <= 4) { + bool IsCleanUp = (NumOps == 3); + + if (!IsCleanUp) + if (ConstantInt *CI = dyn_cast(II->getOperand(3))) + IsCleanUp = (CI->getZExtValue() == 0); + + if (IsCleanUp) + SelsToConvert.insert(II); + } + } + } + } + } + + if (!SelsToConvert.empty()) { + // Convert all clean-up eh.selectors, which are associated with "invokes" of + // URoR calls, into catch-all eh.selectors. + Changed = true; + + for (SmallPtrSet::iterator + SI = SelsToConvert.begin(), SE = SelsToConvert.end(); + SI != SE; ++SI) { + IntrinsicInst *II = *SI; + SmallVector Args; + + // Use the exception object pointer and the personality function + // from the original selector. + Args.push_back(II->getOperand(1)); // Exception object pointer. + Args.push_back(II->getOperand(2)); // Personality function. + Args.push_back(EHCatchAllValue->getInitializer()); // Catch-all indicator. + + CallInst *NewSelector = + CallInst::Create(SelectorIntrinsic, Args.begin(), Args.end(), + "eh.sel.catch.all", II); + + NewSelector->setTailCall(II->isTailCall()); + NewSelector->setAttributes(II->getAttributes()); + NewSelector->setCallingConv(II->getCallingConv()); + + II->replaceAllUsesWith(NewSelector); + II->eraseFromParent(); + } + } + + Changed |= CleanupSelectors(); + return Changed; +} + /// NormalizeLandingPads - Normalize and discover landing pads, noting them /// in the LandingPads set. A landing pad is normal if the only CFG edges /// that end at it are unwind edges from invoke instructions. 
If we inlined @@ -422,6 +728,8 @@ bool DwarfEHPrepare::runOnFunction(Function &Fn) { if (!CompileFast) Changed |= PromoteStackTemporaries(); + Changed |= HandleURoRInvokes(); + LandingPads.clear(); return Changed; diff --git a/lib/CodeGen/LiveInterval.cpp b/lib/CodeGen/LiveInterval.cpp index e207f607dad2..025ad0538f2c 100644 --- a/lib/CodeGen/LiveInterval.cpp +++ b/lib/CodeGen/LiveInterval.cpp @@ -303,9 +303,7 @@ void LiveInterval::removeRange(SlotIndex Start, SlotIndex End, // otherwise mark it as ~1U so it can be nuked later. if (ValNo->id == getNumValNums()-1) { do { - VNInfo *VNI = valnos.back(); valnos.pop_back(); - VNI->~VNInfo(); } while (!valnos.empty() && valnos.back()->isUnused()); } else { ValNo->setIsUnused(true); @@ -351,9 +349,7 @@ void LiveInterval::removeValNo(VNInfo *ValNo) { // otherwise mark it as ~1U so it can be nuked later. if (ValNo->id == getNumValNums()-1) { do { - VNInfo *VNI = valnos.back(); valnos.pop_back(); - VNI->~VNInfo(); } while (!valnos.empty() && valnos.back()->isUnused()); } else { ValNo->setIsUnused(true); @@ -579,9 +575,7 @@ void LiveInterval::MergeValueInAsValue( // mark it as ~1U so it can be nuked later. if (V1->id == getNumValNums()-1) { do { - VNInfo *VNI = valnos.back(); valnos.pop_back(); - VNI->~VNInfo(); } while (!valnos.empty() && valnos.back()->isUnused()); } else { V1->setIsUnused(true); @@ -597,7 +591,7 @@ void LiveInterval::MergeValueInAsValue( /// used with an unknown definition value. void LiveInterval::MergeInClobberRanges(LiveIntervals &li_, const LiveInterval &Clobbers, - BumpPtrAllocator &VNInfoAllocator) { + VNInfo::Allocator &VNInfoAllocator) { if (Clobbers.empty()) return; DenseMap ValNoMaps; @@ -658,14 +652,13 @@ void LiveInterval::MergeInClobberRanges(LiveIntervals &li_, if (UnusedValNo) { // Delete the last unused val#. 
valnos.pop_back(); - UnusedValNo->~VNInfo(); } } void LiveInterval::MergeInClobberRange(LiveIntervals &li_, SlotIndex Start, SlotIndex End, - BumpPtrAllocator &VNInfoAllocator) { + VNInfo::Allocator &VNInfoAllocator) { // Find a value # to use for the clobber ranges. If there is already a value# // for unknown values, use it. VNInfo *ClobberValNo = @@ -749,9 +742,7 @@ VNInfo* LiveInterval::MergeValueNumberInto(VNInfo *V1, VNInfo *V2) { // ~1U so it can be nuked later. if (V1->id == getNumValNums()-1) { do { - VNInfo *VNI = valnos.back(); valnos.pop_back(); - VNI->~VNInfo(); } while (valnos.back()->isUnused()); } else { V1->setIsUnused(true); @@ -762,7 +753,7 @@ VNInfo* LiveInterval::MergeValueNumberInto(VNInfo *V1, VNInfo *V2) { void LiveInterval::Copy(const LiveInterval &RHS, MachineRegisterInfo *MRI, - BumpPtrAllocator &VNInfoAllocator) { + VNInfo::Allocator &VNInfoAllocator) { ranges.clear(); valnos.clear(); std::pair Hint = MRI->getRegAllocationHint(RHS.reg); diff --git a/lib/CodeGen/LiveIntervalAnalysis.cpp b/lib/CodeGen/LiveIntervalAnalysis.cpp index b3e921689c7a..23cff0786a5e 100644 --- a/lib/CodeGen/LiveIntervalAnalysis.cpp +++ b/lib/CodeGen/LiveIntervalAnalysis.cpp @@ -91,7 +91,7 @@ void LiveIntervals::releaseMemory() { r2iMap_.clear(); // Release VNInfo memroy regions after all VNInfo objects are dtor'd. 
- VNInfoAllocator.Reset(); + VNInfoAllocator.DestroyAll(); while (!CloneMIs.empty()) { MachineInstr *MI = CloneMIs.back(); CloneMIs.pop_back(); @@ -819,8 +819,9 @@ bool LiveIntervals::isReMaterializable(const LiveInterval &li, unsigned ImpUse = getReMatImplicitUse(li, MI); if (ImpUse) { const LiveInterval &ImpLi = getInterval(ImpUse); - for (MachineRegisterInfo::use_iterator ri = mri_->use_begin(li.reg), - re = mri_->use_end(); ri != re; ++ri) { + for (MachineRegisterInfo::use_nodbg_iterator + ri = mri_->use_nodbg_begin(li.reg), re = mri_->use_nodbg_end(); + ri != re; ++ri) { MachineInstr *UseMI = &*ri; SlotIndex UseIdx = getInstructionIndex(UseMI); if (li.FindLiveRangeContaining(UseIdx)->valno != ValNo) @@ -1052,7 +1053,7 @@ rewriteInstructionForSpills(const LiveInterval &li, const VNInfo *VNI, // all of its uses are rematerialized, simply delete it. if (MI == ReMatOrigDefMI && CanDelete) { DEBUG(dbgs() << "\t\t\t\tErasing re-materializable def: " - << MI << '\n'); + << *MI << '\n'); RemoveMachineInstrFromMaps(MI); vrm.RemoveMachineInstrFromMaps(MI); MI->eraseFromParent(); @@ -1520,6 +1521,12 @@ LiveIntervals::handleSpilledImpDefs(const LiveInterval &li, VirtRegMap &vrm, MachineOperand &O = ri.getOperand(); MachineInstr *MI = &*ri; ++ri; + if (MI->isDebugValue()) { + // Remove debug info for now. 
+ O.setReg(0U); + DEBUG(dbgs() << "Removing debug info due to spill:" << "\t" << *MI); + continue; + } if (O.isDef()) { assert(MI->isImplicitDef() && "Register def was not rewritten?"); @@ -2012,6 +2019,8 @@ unsigned LiveIntervals::getNumConflictsWithPhysReg(const LiveInterval &li, E = mri_->reg_end(); I != E; ++I) { MachineOperand &O = I.getOperand(); MachineInstr *MI = O.getParent(); + if (MI->isDebugValue()) + continue; SlotIndex Index = getInstructionIndex(MI); if (pli.liveAt(Index)) ++NumConflicts; @@ -2052,7 +2061,7 @@ bool LiveIntervals::spillPhysRegAroundRegDefsUses(const LiveInterval &li, E = mri_->reg_end(); I != E; ++I) { MachineOperand &O = I.getOperand(); MachineInstr *MI = O.getParent(); - if (SeenMIs.count(MI)) + if (MI->isDebugValue() || SeenMIs.count(MI)) continue; SeenMIs.insert(MI); SlotIndex Index = getInstructionIndex(MI); diff --git a/lib/CodeGen/LiveStackAnalysis.cpp b/lib/CodeGen/LiveStackAnalysis.cpp index d2f3775288c5..798b9b939cd3 100644 --- a/lib/CodeGen/LiveStackAnalysis.cpp +++ b/lib/CodeGen/LiveStackAnalysis.cpp @@ -36,7 +36,7 @@ void LiveStacks::getAnalysisUsage(AnalysisUsage &AU) const { void LiveStacks::releaseMemory() { // Release VNInfo memroy regions after all VNInfo objects are dtor'd. - VNInfoAllocator.Reset(); + VNInfoAllocator.DestroyAll(); S2IMap.clear(); S2RCMap.clear(); } diff --git a/lib/CodeGen/LiveVariables.cpp b/lib/CodeGen/LiveVariables.cpp index 519990e04a2c..ca8ecff8e61d 100644 --- a/lib/CodeGen/LiveVariables.cpp +++ b/lib/CodeGen/LiveVariables.cpp @@ -556,17 +556,21 @@ bool LiveVariables::runOnMachineFunction(MachineFunction &mf) { if (MI->isPHI()) NumOperandsToProcess = 1; + // Clear kill and dead markers. LV will recompute them. 
SmallVector UseRegs; SmallVector DefRegs; for (unsigned i = 0; i != NumOperandsToProcess; ++i) { - const MachineOperand &MO = MI->getOperand(i); + MachineOperand &MO = MI->getOperand(i); if (!MO.isReg() || MO.getReg() == 0) continue; unsigned MOReg = MO.getReg(); - if (MO.isUse()) + if (MO.isUse()) { + MO.setIsKill(false); UseRegs.push_back(MOReg); - if (MO.isDef()) + } else /*MO.isDef()*/ { + MO.setIsDead(false); DefRegs.push_back(MOReg); + } } // Process all uses. diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp index fc8ae5f705d6..bd0ccb40f53b 100644 --- a/lib/CodeGen/MachineBasicBlock.cpp +++ b/lib/CodeGen/MachineBasicBlock.cpp @@ -23,6 +23,7 @@ #include "llvm/Target/TargetMachine.h" #include "llvm/Assembly/Writer.h" #include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/Support/Debug.h" #include "llvm/Support/LeakDetector.h" #include "llvm/Support/raw_ostream.h" @@ -45,9 +46,9 @@ MCSymbol *MachineBasicBlock::getSymbol() const { const MachineFunction *MF = getParent(); MCContext &Ctx = MF->getContext(); const char *Prefix = Ctx.getAsmInfo().getPrivateGlobalPrefix(); - return Ctx.GetOrCreateTemporarySymbol(Twine(Prefix) + "BB" + - Twine(MF->getFunctionNumber()) + "_" + - Twine(getNumber())); + return Ctx.GetOrCreateSymbol(Twine(Prefix) + "BB" + + Twine(MF->getFunctionNumber()) + "_" + + Twine(getNumber())); } @@ -459,54 +460,41 @@ bool MachineBasicBlock::CorrectExtraCFGEdges(MachineBasicBlock *DestA, // conditional branch followed by an unconditional branch. DestA is the // 'true' destination and DestB is the 'false' destination. - bool MadeChange = false; - bool AddedFallThrough = false; + bool Changed = false; MachineFunction::iterator FallThru = llvm::next(MachineFunction::iterator(this)); - - if (isCond) { - // If this block ends with a conditional branch that falls through to its - // successor, set DestB as the successor. 
- if (DestB == 0 && FallThru != getParent()->end()) { + + if (DestA == 0 && DestB == 0) { + // Block falls through to successor. + DestA = FallThru; + DestB = FallThru; + } else if (DestA != 0 && DestB == 0) { + if (isCond) + // Block ends in conditional jump that falls through to successor. DestB = FallThru; - AddedFallThrough = true; - } } else { - // If this is an unconditional branch with no explicit dest, it must just be - // a fallthrough into DestA. - if (DestA == 0 && FallThru != getParent()->end()) { - DestA = FallThru; - AddedFallThrough = true; - } + assert(DestA && DestB && isCond && + "CFG in a bad state. Cannot correct CFG edges"); } - + + // Remove superfluous edges. I.e., those which aren't destinations of this + // basic block, duplicate edges, or landing pads. + SmallPtrSet SeenMBBs; MachineBasicBlock::succ_iterator SI = succ_begin(); - MachineBasicBlock *OrigDestA = DestA, *OrigDestB = DestB; while (SI != succ_end()) { const MachineBasicBlock *MBB = *SI; - if (MBB == DestA) { - DestA = 0; - ++SI; - } else if (MBB == DestB) { - DestB = 0; - ++SI; - } else if (MBB->isLandingPad() && - MBB != OrigDestA && MBB != OrigDestB) { - ++SI; - } else { - // Otherwise, this is a superfluous edge, remove it. + if (!SeenMBBs.insert(MBB) || + (MBB != DestA && MBB != DestB && !MBB->isLandingPad())) { + // This is a superfluous edge, remove it. 
SI = removeSuccessor(SI); - MadeChange = true; + Changed = true; + } else { + ++SI; } } - if (!AddedFallThrough) - assert(DestA == 0 && DestB == 0 && "MachineCFG is missing edges!"); - else if (isCond) - assert(DestA == 0 && "MachineCFG is missing edges!"); - - return MadeChange; + return Changed; } /// findDebugLoc - find the next valid DebugLoc starting at MBBI, skipping diff --git a/lib/CodeGen/MachineCSE.cpp b/lib/CodeGen/MachineCSE.cpp index 91d363588295..597d51d4f370 100644 --- a/lib/CodeGen/MachineCSE.cpp +++ b/lib/CodeGen/MachineCSE.cpp @@ -117,17 +117,15 @@ bool MachineCSE::isPhysDefTriviallyDead(unsigned Reg, MachineBasicBlock::const_iterator I, MachineBasicBlock::const_iterator E) { unsigned LookAheadLeft = 5; - while (LookAheadLeft--) { + while (LookAheadLeft) { + // Skip over dbg_value's. + while (I != E && I->isDebugValue()) + ++I; + if (I == E) // Reached end of block, register is obviously dead. return true; - if (I->isDebugValue()) { - // These must not count against the limit. - ++LookAheadLeft; - ++I; - continue; - } bool SeenDef = false; for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { const MachineOperand &MO = I->getOperand(i); @@ -143,6 +141,8 @@ bool MachineCSE::isPhysDefTriviallyDead(unsigned Reg, // See a def of Reg (or an alias) before encountering any use, it's // trivially dead. return true; + + --LookAheadLeft; ++I; } return false; @@ -294,8 +294,12 @@ bool MachineCSE::ProcessBlock(MachineDomTreeNode *Node) { bool FoundCSE = VNT.count(MI); if (!FoundCSE) { // Look for trivial copy coalescing opportunities. - if (PerformTrivialCoalescing(MI, MBB)) + if (PerformTrivialCoalescing(MI, MBB)) { + // After coalescing MI itself may become a copy. + if (isCopy(MI, TII)) + continue; FoundCSE = VNT.count(MI); + } } // FIXME: commute commutable instructions? 
diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp index 5772b2f2d189..f6cc71f3a437 100644 --- a/lib/CodeGen/MachineFunction.cpp +++ b/lib/CodeGen/MachineFunction.cpp @@ -460,9 +460,7 @@ MCSymbol *MachineFunction::getJTISymbol(unsigned JTI, MCContext &Ctx, SmallString<60> Name; raw_svector_ostream(Name) << Prefix << "JTI" << getFunctionNumber() << '_' << JTI; - if (isLinkerPrivate) - return Ctx.GetOrCreateSymbol(Name.str()); - return Ctx.GetOrCreateTemporarySymbol(Name.str()); + return Ctx.GetOrCreateSymbol(Name.str()); } diff --git a/lib/CodeGen/MachineModuleInfo.cpp b/lib/CodeGen/MachineModuleInfo.cpp index af48e9ebb5b3..ad4f01b7a9ad 100644 --- a/lib/CodeGen/MachineModuleInfo.cpp +++ b/lib/CodeGen/MachineModuleInfo.cpp @@ -44,6 +44,10 @@ class MMIAddrLabelMapCallbackPtr : CallbackVH { MMIAddrLabelMapCallbackPtr() : Map(0) {} MMIAddrLabelMapCallbackPtr(Value *V) : CallbackVH(V), Map(0) {} + void setPtr(BasicBlock *BB) { + ValueHandleBase::operator=(BB); + } + void setMap(MMIAddrLabelMap *map) { Map = map; } virtual void deleted(); @@ -209,7 +213,7 @@ void MMIAddrLabelMap::UpdateForRAUWBlock(BasicBlock *Old, BasicBlock *New) { // If New is not address taken, just move our symbol over to it. if (NewEntry.Symbols.isNull()) { - BBCallbacks[OldEntry.Index] = New; // Update the callback. + BBCallbacks[OldEntry.Index].setPtr(New); // Update the callback. NewEntry = OldEntry; // Set New's entry. return; } diff --git a/lib/CodeGen/OptimizeExts.cpp b/lib/CodeGen/OptimizeExts.cpp index acb6869ed175..41fc20407441 100644 --- a/lib/CodeGen/OptimizeExts.cpp +++ b/lib/CodeGen/OptimizeExts.cpp @@ -73,6 +73,9 @@ FunctionPass *llvm::createOptimizeExtsPass() { return new OptimizeExts(); } /// the source, and if the source value is preserved as a sub-register of /// the result, then replace all reachable uses of the source with the subreg /// of the result. +/// Do not generate an EXTRACT that is used only in a debug use, as this +/// changes the code. 
Since this code does not currently share EXTRACTs, just +/// ignore all debug uses. bool OptimizeExts::OptimizeInstr(MachineInstr *MI, MachineBasicBlock *MBB, SmallPtrSet &LocalMIs) { bool Changed = false; @@ -84,17 +87,17 @@ bool OptimizeExts::OptimizeInstr(MachineInstr *MI, MachineBasicBlock *MBB, TargetRegisterInfo::isPhysicalRegister(SrcReg)) return false; - MachineRegisterInfo::use_iterator UI = MRI->use_begin(SrcReg); - if (++UI == MRI->use_end()) + MachineRegisterInfo::use_nodbg_iterator UI = MRI->use_nodbg_begin(SrcReg); + if (++UI == MRI->use_nodbg_end()) // No other uses. return false; // Ok, the source has other uses. See if we can replace the other uses // with use of the result of the extension. SmallPtrSet ReachedBBs; - UI = MRI->use_begin(DstReg); - for (MachineRegisterInfo::use_iterator UE = MRI->use_end(); UI != UE; - ++UI) + UI = MRI->use_nodbg_begin(DstReg); + for (MachineRegisterInfo::use_nodbg_iterator UE = MRI->use_nodbg_end(); + UI != UE; ++UI) ReachedBBs.insert(UI->getParent()); bool ExtendLife = true; @@ -103,9 +106,9 @@ bool OptimizeExts::OptimizeInstr(MachineInstr *MI, MachineBasicBlock *MBB, // Uses that the result of the instruction can reach. SmallVector ExtendedUses; - UI = MRI->use_begin(SrcReg); - for (MachineRegisterInfo::use_iterator UE = MRI->use_end(); UI != UE; - ++UI) { + UI = MRI->use_nodbg_begin(SrcReg); + for (MachineRegisterInfo::use_nodbg_iterator UE = MRI->use_nodbg_end(); + UI != UE; ++UI) { MachineOperand &UseMO = UI.getOperand(); MachineInstr *UseMI = &*UI; if (UseMI == MI) @@ -147,9 +150,9 @@ bool OptimizeExts::OptimizeInstr(MachineInstr *MI, MachineBasicBlock *MBB, // Look for PHI uses of the extended result, we don't want to extend the // liveness of a PHI input. It breaks all kinds of assumptions down // stream. A PHI use is expected to be the kill of its source values. 
- UI = MRI->use_begin(DstReg); - for (MachineRegisterInfo::use_iterator UE = MRI->use_end(); UI != UE; - ++UI) + UI = MRI->use_nodbg_begin(DstReg); + for (MachineRegisterInfo::use_nodbg_iterator UE = MRI->use_nodbg_end(); + UI != UE; ++UI) if (UI->isPHI()) PHIBBs.insert(UI->getParent()); diff --git a/lib/CodeGen/PHIElimination.cpp b/lib/CodeGen/PHIElimination.cpp index 8bbe0a725286..f0057ce8ef8f 100644 --- a/lib/CodeGen/PHIElimination.cpp +++ b/lib/CodeGen/PHIElimination.cpp @@ -74,7 +74,7 @@ bool llvm::PHIElimination::runOnMachineFunction(MachineFunction &Fn) { E = ImpDefs.end(); I != E; ++I) { MachineInstr *DefMI = *I; unsigned DefReg = DefMI->getOperand(0).getReg(); - if (MRI->use_empty(DefReg)) + if (MRI->use_nodbg_empty(DefReg)) DefMI->eraseFromParent(); } diff --git a/lib/CodeGen/PreAllocSplitting.cpp b/lib/CodeGen/PreAllocSplitting.cpp index 70e91aa3bd09..2d49beb7d761 100644 --- a/lib/CodeGen/PreAllocSplitting.cpp +++ b/lib/CodeGen/PreAllocSplitting.cpp @@ -665,7 +665,7 @@ PreAllocSplitting::PerformPHIConstructionFallBack(MachineBasicBlock::iterator Us /// ReconstructLiveInterval - Recompute a live interval from scratch. 
void PreAllocSplitting::ReconstructLiveInterval(LiveInterval* LI) { - BumpPtrAllocator& Alloc = LIs->getVNInfoAllocator(); + VNInfo::Allocator& Alloc = LIs->getVNInfoAllocator(); // Clear the old ranges and valnos; LI->clear(); diff --git a/lib/CodeGen/RegAllocLocal.cpp b/lib/CodeGen/RegAllocLocal.cpp index 2c69065b1427..0ef041e76b20 100644 --- a/lib/CodeGen/RegAllocLocal.cpp +++ b/lib/CodeGen/RegAllocLocal.cpp @@ -118,8 +118,8 @@ namespace { bool isVirtRegModified(unsigned Reg) const { assert(TargetRegisterInfo::isVirtualRegister(Reg) && "Illegal VirtReg!"); - assert(Reg - TargetRegisterInfo::FirstVirtualRegister < VirtRegModified.size() - && "Illegal virtual register!"); + assert(Reg - TargetRegisterInfo::FirstVirtualRegister < + VirtRegModified.size() && "Illegal virtual register!"); return VirtRegModified[Reg - TargetRegisterInfo::FirstVirtualRegister]; } @@ -135,15 +135,16 @@ namespace { if (PhysRegsUseOrder.empty() || PhysRegsUseOrder.back() == Reg) return; // Already most recently used - for (unsigned i = PhysRegsUseOrder.size(); i != 0; --i) - if (areRegsEqual(Reg, PhysRegsUseOrder[i-1])) { - unsigned RegMatch = PhysRegsUseOrder[i-1]; // remove from middle - PhysRegsUseOrder.erase(PhysRegsUseOrder.begin()+i-1); - // Add it to the end of the list - PhysRegsUseOrder.push_back(RegMatch); - if (RegMatch == Reg) - return; // Found an exact match, exit early - } + for (unsigned i = PhysRegsUseOrder.size(); i != 0; --i) { + unsigned RegMatch = PhysRegsUseOrder[i-1]; // remove from middle + if (!areRegsEqual(Reg, RegMatch)) continue; + + PhysRegsUseOrder.erase(PhysRegsUseOrder.begin()+i-1); + // Add it to the end of the list + PhysRegsUseOrder.push_back(RegMatch); + if (RegMatch == Reg) + return; // Found an exact match, exit early + } } public: @@ -267,7 +268,7 @@ int RALocal::getStackSpaceFor(unsigned VirtReg, const TargetRegisterClass *RC) { int FrameIdx = MF->getFrameInfo()->CreateSpillStackObject(RC->getSize(), RC->getAlignment()); - // Assign the slot... 
+ // Assign the slot. StackSlotForVirtReg[VirtReg] = FrameIdx; return FrameIdx; } @@ -337,15 +338,19 @@ void RALocal::spillPhysReg(MachineBasicBlock &MBB, MachineInstr *I, assert(PhysRegsUsed[PhysReg] != -2 && "Non allocable reg used!"); if (PhysRegsUsed[PhysReg] || !OnlyVirtRegs) spillVirtReg(MBB, I, PhysRegsUsed[PhysReg], PhysReg); - } else { - // If the selected register aliases any other registers, we must make - // sure that one of the aliases isn't alive. - for (const unsigned *AliasSet = TRI->getAliasSet(PhysReg); - *AliasSet; ++AliasSet) - if (PhysRegsUsed[*AliasSet] != -1 && // Spill aliased register. - PhysRegsUsed[*AliasSet] != -2) // If allocatable. - if (PhysRegsUsed[*AliasSet]) - spillVirtReg(MBB, I, PhysRegsUsed[*AliasSet], *AliasSet); + return; + } + + // If the selected register aliases any other registers, we must make + // sure that one of the aliases isn't alive. + for (const unsigned *AliasSet = TRI->getAliasSet(PhysReg); + *AliasSet; ++AliasSet) { + if (PhysRegsUsed[*AliasSet] == -1 || // Spill aliased register. + PhysRegsUsed[*AliasSet] == -2) // If allocatable. + continue; + + if (PhysRegsUsed[*AliasSet]) + spillVirtReg(MBB, I, PhysRegsUsed[*AliasSet], *AliasSet); } } @@ -410,59 +415,64 @@ unsigned RALocal::getReg(MachineBasicBlock &MBB, MachineInstr *I, // First check to see if we have a free register of the requested type... unsigned PhysReg = NoFree ? 0 : getFreeReg(RC); + if (PhysReg != 0) { + // Assign the register. + assignVirtToPhysReg(VirtReg, PhysReg); + return PhysReg; + } + // If we didn't find an unused register, scavenge one now! - if (PhysReg == 0) { - assert(!PhysRegsUseOrder.empty() && "No allocated registers??"); + assert(!PhysRegsUseOrder.empty() && "No allocated registers??"); - // Loop over all of the preallocated registers from the least recently used - // to the most recently used. When we find one that is capable of holding - // our register, use it. 
- for (unsigned i = 0; PhysReg == 0; ++i) { - assert(i != PhysRegsUseOrder.size() && - "Couldn't find a register of the appropriate class!"); + // Loop over all of the preallocated registers from the least recently used + // to the most recently used. When we find one that is capable of holding + // our register, use it. + for (unsigned i = 0; PhysReg == 0; ++i) { + assert(i != PhysRegsUseOrder.size() && + "Couldn't find a register of the appropriate class!"); - unsigned R = PhysRegsUseOrder[i]; + unsigned R = PhysRegsUseOrder[i]; - // We can only use this register if it holds a virtual register (ie, it - // can be spilled). Do not use it if it is an explicitly allocated - // physical register! - assert(PhysRegsUsed[R] != -1 && - "PhysReg in PhysRegsUseOrder, but is not allocated?"); - if (PhysRegsUsed[R] && PhysRegsUsed[R] != -2) { - // If the current register is compatible, use it. - if (RC->contains(R)) { - PhysReg = R; - break; - } else { - // If one of the registers aliased to the current register is - // compatible, use it. - for (const unsigned *AliasIt = TRI->getAliasSet(R); - *AliasIt; ++AliasIt) { - if (RC->contains(*AliasIt) && - // If this is pinned down for some reason, don't use it. For - // example, if CL is pinned, and we run across CH, don't use - // CH as justification for using scavenging ECX (which will - // fail). - PhysRegsUsed[*AliasIt] != 0 && - - // Make sure the register is allocatable. Don't allocate SIL on - // x86-32. - PhysRegsUsed[*AliasIt] != -2) { - PhysReg = *AliasIt; // Take an aliased register - break; - } - } - } + // We can only use this register if it holds a virtual register (ie, it + // can be spilled). Do not use it if it is an explicitly allocated + // physical register! + assert(PhysRegsUsed[R] != -1 && + "PhysReg in PhysRegsUseOrder, but is not allocated?"); + if (PhysRegsUsed[R] && PhysRegsUsed[R] != -2) { + // If the current register is compatible, use it. 
+ if (RC->contains(R)) { + PhysReg = R; + break; + } + + // If one of the registers aliased to the current register is + // compatible, use it. + for (const unsigned *AliasIt = TRI->getAliasSet(R); + *AliasIt; ++AliasIt) { + if (!RC->contains(*AliasIt)) continue; + + // If this is pinned down for some reason, don't use it. For + // example, if CL is pinned, and we run across CH, don't use + // CH as justification for using scavenging ECX (which will + // fail). + if (PhysRegsUsed[*AliasIt] == 0) continue; + + // Make sure the register is allocatable. Don't allocate SIL on + // x86-32. + if (PhysRegsUsed[*AliasIt] == -2) continue; + + PhysReg = *AliasIt; // Take an aliased register + break; } } - - assert(PhysReg && "Physical register not assigned!?!?"); - - // At this point PhysRegsUseOrder[i] is the least recently used register of - // compatible register class. Spill it to memory and reap its remains. - spillPhysReg(MBB, I, PhysReg); } + assert(PhysReg && "Physical register not assigned!?!?"); + + // At this point PhysRegsUseOrder[i] is the least recently used register of + // compatible register class. Spill it to memory and reap its remains. + spillPhysReg(MBB, I, PhysReg); + // Now that we know which register we need to assign this to, do it now! 
assignVirtToPhysReg(VirtReg, PhysReg); return PhysReg; @@ -543,17 +553,17 @@ MachineInstr *RALocal::reloadVirtReg(MachineBasicBlock &MBB, MachineInstr *MI, } for (const unsigned *SubRegs = TRI->getSubRegisters(PhysReg); *SubRegs; ++SubRegs) { - if (!ReloadedRegs.insert(*SubRegs)) { - std::string msg; - raw_string_ostream Msg(msg); - Msg << "Ran out of registers during register allocation!"; - if (MI->isInlineAsm()) { - Msg << "\nPlease check your inline asm statement for invalid " - << "constraints:\n"; - MI->print(Msg, TM); - } - llvm_report_error(Msg.str()); + if (ReloadedRegs.insert(*SubRegs)) continue; + + std::string msg; + raw_string_ostream Msg(msg); + Msg << "Ran out of registers during register allocation!"; + if (MI->isInlineAsm()) { + Msg << "\nPlease check your inline asm statement for invalid " + << "constraints:\n"; + MI->print(Msg, TM); } + llvm_report_error(Msg.str()); } return MI; @@ -563,7 +573,7 @@ MachineInstr *RALocal::reloadVirtReg(MachineBasicBlock &MBB, MachineInstr *MI, /// read/mod/write register, i.e. update partial register. static bool isReadModWriteImplicitKill(MachineInstr *MI, unsigned Reg) { for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand& MO = MI->getOperand(i); + MachineOperand &MO = MI->getOperand(i); if (MO.isReg() && MO.getReg() == Reg && MO.isImplicit() && MO.isDef() && !MO.isDead()) return true; @@ -575,7 +585,7 @@ static bool isReadModWriteImplicitKill(MachineInstr *MI, unsigned Reg) { /// read/mod/write register, i.e. update partial register. 
static bool isReadModWriteImplicitDef(MachineInstr *MI, unsigned Reg) { for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand& MO = MI->getOperand(i); + MachineOperand &MO = MI->getOperand(i); if (MO.isReg() && MO.getReg() == Reg && MO.isImplicit() && !MO.isDef() && MO.isKill()) return true; @@ -606,7 +616,7 @@ static bool precedes(MachineBasicBlock::iterator A, /// ComputeLocalLiveness - Computes liveness of registers within a basic /// block, setting the killed/dead flags as appropriate. void RALocal::ComputeLocalLiveness(MachineBasicBlock& MBB) { - MachineRegisterInfo& MRI = MBB.getParent()->getRegInfo(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); // Keep track of the most recently seen previous use or def of each reg, // so that we can update them with dead/kill markers. DenseMap > LastUseDef; @@ -614,58 +624,60 @@ void RALocal::ComputeLocalLiveness(MachineBasicBlock& MBB) { I != E; ++I) { if (I->isDebugValue()) continue; + for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { - MachineOperand& MO = I->getOperand(i); + MachineOperand &MO = I->getOperand(i); // Uses don't trigger any flags, but we need to save // them for later. Also, we have to process these // _before_ processing the defs, since an instr // uses regs before it defs them. 
- if (MO.isReg() && MO.getReg() && MO.isUse()) { - LastUseDef[MO.getReg()] = std::make_pair(I, i); + if (!MO.isReg() || !MO.getReg() || !MO.isUse()) + continue; + + LastUseDef[MO.getReg()] = std::make_pair(I, i); + + if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) continue; + + const unsigned *Aliases = TRI->getAliasSet(MO.getReg()); + if (Aliases == 0) + continue; + + while (*Aliases) { + DenseMap >::iterator + alias = LastUseDef.find(*Aliases); + if (alias != LastUseDef.end() && alias->second.first != I) + LastUseDef[*Aliases] = std::make_pair(I, i); - if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) continue; - - const unsigned* Aliases = TRI->getAliasSet(MO.getReg()); - if (Aliases) { - while (*Aliases) { - DenseMap >::iterator - alias = LastUseDef.find(*Aliases); - - if (alias != LastUseDef.end() && alias->second.first != I) - LastUseDef[*Aliases] = std::make_pair(I, i); - - ++Aliases; - } - } + ++Aliases; } } for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { - MachineOperand& MO = I->getOperand(i); + MachineOperand &MO = I->getOperand(i); // Defs others than 2-addr redefs _do_ trigger flag changes: // - A def followed by a def is dead // - A use followed by a def is a kill - if (MO.isReg() && MO.getReg() && MO.isDef()) { - DenseMap >::iterator - last = LastUseDef.find(MO.getReg()); - if (last != LastUseDef.end()) { - // Check if this is a two address instruction. If so, then - // the def does not kill the use. - if (last->second.first == I && - I->isRegTiedToUseOperand(i)) - continue; - - MachineOperand& lastUD = - last->second.first->getOperand(last->second.second); - if (lastUD.isDef()) - lastUD.setIsDead(true); - else - lastUD.setIsKill(true); - } + if (!MO.isReg() || !MO.getReg() || !MO.isDef()) continue; + + DenseMap >::iterator + last = LastUseDef.find(MO.getReg()); + if (last != LastUseDef.end()) { + // Check if this is a two address instruction. If so, then + // the def does not kill the use. 
+ if (last->second.first == I && + I->isRegTiedToUseOperand(i)) + continue; - LastUseDef[MO.getReg()] = std::make_pair(I, i); + MachineOperand &lastUD = + last->second.first->getOperand(last->second.second); + if (lastUD.isDef()) + lastUD.setIsDead(true); + else + lastUD.setIsKill(true); } + + LastUseDef[MO.getReg()] = std::make_pair(I, i); } } @@ -687,9 +699,9 @@ void RALocal::ComputeLocalLiveness(MachineBasicBlock& MBB) { // in the block and determine if it is dead. for (DenseMap >::iterator I = LastUseDef.begin(), E = LastUseDef.end(); I != E; ++I) { - MachineInstr* MI = I->second.first; + MachineInstr *MI = I->second.first; unsigned idx = I->second.second; - MachineOperand& MO = MI->getOperand(idx); + MachineOperand &MO = MI->getOperand(idx); bool isPhysReg = TargetRegisterInfo::isPhysicalRegister(MO.getReg()); @@ -712,20 +724,21 @@ void RALocal::ComputeLocalLiveness(MachineBasicBlock& MBB) { // Two cases: // - used in another block // - used in the same block before it is defined (loop) - if (UI->getParent() != &MBB || - (MO.isDef() && UI.getOperand().isUse() && precedes(&*UI, MI))) { - if (UI->isDebugValue()) { - UsedByDebugValueOnly = true; - continue; - } - - // A non-DBG_VALUE use means we can leave DBG_VALUE uses alone. - UsedInMultipleBlocks.set(MO.getReg() - - TargetRegisterInfo::FirstVirtualRegister); - usedOutsideBlock = true; - UsedByDebugValueOnly = false; - break; + if (UI->getParent() == &MBB && + !(MO.isDef() && UI.getOperand().isUse() && precedes(&*UI, MI))) + continue; + + if (UI->isDebugValue()) { + UsedByDebugValueOnly = true; + continue; } + + // A non-DBG_VALUE use means we can leave DBG_VALUE uses alone. 
+ UsedInMultipleBlocks.set(MO.getReg() - + TargetRegisterInfo::FirstVirtualRegister); + usedOutsideBlock = true; + UsedByDebugValueOnly = false; + break; } if (UsedByDebugValueOnly) @@ -770,11 +783,11 @@ void RALocal::AllocateBasicBlock(MachineBasicBlock &MBB) { AddToPhysRegsUseOrder(Reg); for (const unsigned *SubRegs = TRI->getSubRegisters(Reg); *SubRegs; ++SubRegs) { - if (PhysRegsUsed[*SubRegs] != -2) { - AddToPhysRegsUseOrder(*SubRegs); - PhysRegsUsed[*SubRegs] = 0; // It is free and reserved now - MF->getRegInfo().setPhysRegUsed(*SubRegs); - } + if (PhysRegsUsed[*SubRegs] == -2) continue; + + AddToPhysRegsUseOrder(*SubRegs); + PhysRegsUsed[*SubRegs] = 0; // It is free and reserved now + MF->getRegInfo().setPhysRegUsed(*SubRegs); } } @@ -813,16 +826,16 @@ void RALocal::AllocateBasicBlock(MachineBasicBlock &MBB) { SmallVector Kills; for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand& MO = MI->getOperand(i); - if (MO.isReg() && MO.isKill()) { - if (!MO.isImplicit()) - Kills.push_back(MO.getReg()); - else if (!isReadModWriteImplicitKill(MI, MO.getReg())) - // These are extra physical register kills when a sub-register - // is defined (def of a sub-register is a read/mod/write of the - // larger registers). Ignore. - Kills.push_back(MO.getReg()); - } + MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg() || !MO.isKill()) continue; + + if (!MO.isImplicit()) + Kills.push_back(MO.getReg()); + else if (!isReadModWriteImplicitKill(MI, MO.getReg())) + // These are extra physical register kills when a sub-register + // is defined (def of a sub-register is a read/mod/write of the + // larger registers). Ignore. + Kills.push_back(MO.getReg()); } // If any physical regs are earlyclobber, spill any value they might @@ -830,45 +843,45 @@ void RALocal::AllocateBasicBlock(MachineBasicBlock &MBB) { // If any virtual regs are earlyclobber, allocate them now (before // freeing inputs that are killed). 
if (MI->isInlineAsm()) { - for (unsigned i = 0; i != MI->getNumOperands(); ++i) { - MachineOperand& MO = MI->getOperand(i); - if (MO.isReg() && MO.isDef() && MO.isEarlyClobber() && - MO.getReg()) { - if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) { - unsigned DestVirtReg = MO.getReg(); - unsigned DestPhysReg; + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg() || !MO.isDef() || !MO.isEarlyClobber() || + !MO.getReg()) + continue; + + if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) { + unsigned DestVirtReg = MO.getReg(); + unsigned DestPhysReg; - // If DestVirtReg already has a value, use it. - if (!(DestPhysReg = getVirt2PhysRegMapSlot(DestVirtReg))) - DestPhysReg = getReg(MBB, MI, DestVirtReg); - MF->getRegInfo().setPhysRegUsed(DestPhysReg); - markVirtRegModified(DestVirtReg); - getVirtRegLastUse(DestVirtReg) = - std::make_pair((MachineInstr*)0, 0); - DEBUG(dbgs() << " Assigning " << TRI->getName(DestPhysReg) - << " to %reg" << DestVirtReg << "\n"); - MO.setReg(DestPhysReg); // Assign the earlyclobber register - } else { - unsigned Reg = MO.getReg(); - if (PhysRegsUsed[Reg] == -2) continue; // Something like ESP. - // These are extra physical register defs when a sub-register - // is defined (def of a sub-register is a read/mod/write of the - // larger registers). Ignore. - if (isReadModWriteImplicitDef(MI, MO.getReg())) continue; + // If DestVirtReg already has a value, use it. 
+ if (!(DestPhysReg = getVirt2PhysRegMapSlot(DestVirtReg))) + DestPhysReg = getReg(MBB, MI, DestVirtReg); + MF->getRegInfo().setPhysRegUsed(DestPhysReg); + markVirtRegModified(DestVirtReg); + getVirtRegLastUse(DestVirtReg) = + std::make_pair((MachineInstr*)0, 0); + DEBUG(dbgs() << " Assigning " << TRI->getName(DestPhysReg) + << " to %reg" << DestVirtReg << "\n"); + MO.setReg(DestPhysReg); // Assign the earlyclobber register + } else { + unsigned Reg = MO.getReg(); + if (PhysRegsUsed[Reg] == -2) continue; // Something like ESP. + // These are extra physical register defs when a sub-register + // is defined (def of a sub-register is a read/mod/write of the + // larger registers). Ignore. + if (isReadModWriteImplicitDef(MI, MO.getReg())) continue; - MF->getRegInfo().setPhysRegUsed(Reg); - spillPhysReg(MBB, MI, Reg, true); // Spill any existing value in reg - PhysRegsUsed[Reg] = 0; // It is free and reserved now - AddToPhysRegsUseOrder(Reg); + MF->getRegInfo().setPhysRegUsed(Reg); + spillPhysReg(MBB, MI, Reg, true); // Spill any existing value in reg + PhysRegsUsed[Reg] = 0; // It is free and reserved now + AddToPhysRegsUseOrder(Reg); - for (const unsigned *SubRegs = TRI->getSubRegisters(Reg); - *SubRegs; ++SubRegs) { - if (PhysRegsUsed[*SubRegs] != -2) { - MF->getRegInfo().setPhysRegUsed(*SubRegs); - PhysRegsUsed[*SubRegs] = 0; // It is free and reserved now - AddToPhysRegsUseOrder(*SubRegs); - } - } + for (const unsigned *SubRegs = TRI->getSubRegisters(Reg); + *SubRegs; ++SubRegs) { + if (PhysRegsUsed[*SubRegs] == -2) continue; + MF->getRegInfo().setPhysRegUsed(*SubRegs); + PhysRegsUsed[*SubRegs] = 0; // It is free and reserved now + AddToPhysRegsUseOrder(*SubRegs); } } } @@ -894,7 +907,7 @@ void RALocal::AllocateBasicBlock(MachineBasicBlock &MBB) { // SmallSet ReloadedRegs; for (unsigned i = 0; i != MI->getNumOperands(); ++i) { - MachineOperand& MO = MI->getOperand(i); + MachineOperand &MO = MI->getOperand(i); // here we are looking for only used operands (never 
def&use) if (MO.isReg() && !MO.isDef() && MO.getReg() && !MO.isImplicit() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) @@ -923,18 +936,18 @@ void RALocal::AllocateBasicBlock(MachineBasicBlock &MBB) { "Silently clearing a virtual register?"); } - if (PhysReg) { - DEBUG(dbgs() << " Last use of " << TRI->getName(PhysReg) - << "[%reg" << VirtReg <<"], removing it from live set\n"); - removePhysReg(PhysReg); - for (const unsigned *SubRegs = TRI->getSubRegisters(PhysReg); - *SubRegs; ++SubRegs) { - if (PhysRegsUsed[*SubRegs] != -2) { - DEBUG(dbgs() << " Last use of " - << TRI->getName(*SubRegs) << "[%reg" << VirtReg - <<"], removing it from live set\n"); - removePhysReg(*SubRegs); - } + if (!PhysReg) continue; + + DEBUG(dbgs() << " Last use of " << TRI->getName(PhysReg) + << "[%reg" << VirtReg <<"], removing it from live set\n"); + removePhysReg(PhysReg); + for (const unsigned *SubRegs = TRI->getSubRegisters(PhysReg); + *SubRegs; ++SubRegs) { + if (PhysRegsUsed[*SubRegs] != -2) { + DEBUG(dbgs() << " Last use of " + << TRI->getName(*SubRegs) << "[%reg" << VirtReg + <<"], removing it from live set\n"); + removePhysReg(*SubRegs); } } } @@ -942,30 +955,31 @@ void RALocal::AllocateBasicBlock(MachineBasicBlock &MBB) { // Loop over all of the operands of the instruction, spilling registers that // are defined, and marking explicit destinations in the PhysRegsUsed map. for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand& MO = MI->getOperand(i); - if (MO.isReg() && MO.isDef() && !MO.isImplicit() && MO.getReg() && - !MO.isEarlyClobber() && - TargetRegisterInfo::isPhysicalRegister(MO.getReg())) { - unsigned Reg = MO.getReg(); - if (PhysRegsUsed[Reg] == -2) continue; // Something like ESP. - // These are extra physical register defs when a sub-register - // is defined (def of a sub-register is a read/mod/write of the - // larger registers). Ignore. 
- if (isReadModWriteImplicitDef(MI, MO.getReg())) continue; + MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg() || !MO.isDef() || MO.isImplicit() || !MO.getReg() || + MO.isEarlyClobber() || + !TargetRegisterInfo::isPhysicalRegister(MO.getReg())) + continue; + + unsigned Reg = MO.getReg(); + if (PhysRegsUsed[Reg] == -2) continue; // Something like ESP. + // These are extra physical register defs when a sub-register + // is defined (def of a sub-register is a read/mod/write of the + // larger registers). Ignore. + if (isReadModWriteImplicitDef(MI, MO.getReg())) continue; - MF->getRegInfo().setPhysRegUsed(Reg); - spillPhysReg(MBB, MI, Reg, true); // Spill any existing value in reg - PhysRegsUsed[Reg] = 0; // It is free and reserved now - AddToPhysRegsUseOrder(Reg); + MF->getRegInfo().setPhysRegUsed(Reg); + spillPhysReg(MBB, MI, Reg, true); // Spill any existing value in reg + PhysRegsUsed[Reg] = 0; // It is free and reserved now + AddToPhysRegsUseOrder(Reg); - for (const unsigned *SubRegs = TRI->getSubRegisters(Reg); - *SubRegs; ++SubRegs) { - if (PhysRegsUsed[*SubRegs] != -2) { - MF->getRegInfo().setPhysRegUsed(*SubRegs); - PhysRegsUsed[*SubRegs] = 0; // It is free and reserved now - AddToPhysRegsUseOrder(*SubRegs); - } - } + for (const unsigned *SubRegs = TRI->getSubRegisters(Reg); + *SubRegs; ++SubRegs) { + if (PhysRegsUsed[*SubRegs] == -2) continue; + + MF->getRegInfo().setPhysRegUsed(*SubRegs); + PhysRegsUsed[*SubRegs] = 0; // It is free and reserved now + AddToPhysRegsUseOrder(*SubRegs); } } @@ -982,18 +996,18 @@ void RALocal::AllocateBasicBlock(MachineBasicBlock &MBB) { MF->getRegInfo().setPhysRegUsed(Reg); for (const unsigned *SubRegs = TRI->getSubRegisters(Reg); *SubRegs; ++SubRegs) { - if (PhysRegsUsed[*SubRegs] != -2) { - AddToPhysRegsUseOrder(*SubRegs); - PhysRegsUsed[*SubRegs] = 0; // It is free and reserved now - MF->getRegInfo().setPhysRegUsed(*SubRegs); - } + if (PhysRegsUsed[*SubRegs] == -2) continue; + + AddToPhysRegsUseOrder(*SubRegs); + 
PhysRegsUsed[*SubRegs] = 0; // It is free and reserved now + MF->getRegInfo().setPhysRegUsed(*SubRegs); } } } SmallVector DeadDefs; for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand& MO = MI->getOperand(i); + MachineOperand &MO = MI->getOperand(i); if (MO.isReg() && MO.isDead()) DeadDefs.push_back(MO.getReg()); } @@ -1004,45 +1018,46 @@ void RALocal::AllocateBasicBlock(MachineBasicBlock &MBB) { // we need to scavenge a register. // for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand& MO = MI->getOperand(i); - if (MO.isReg() && MO.isDef() && MO.getReg() && - !MO.isEarlyClobber() && - TargetRegisterInfo::isVirtualRegister(MO.getReg())) { - unsigned DestVirtReg = MO.getReg(); - unsigned DestPhysReg; + MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg() || !MO.isDef() || !MO.getReg() || + MO.isEarlyClobber() || + !TargetRegisterInfo::isVirtualRegister(MO.getReg())) + continue; + + unsigned DestVirtReg = MO.getReg(); + unsigned DestPhysReg; - // If DestVirtReg already has a value, use it. - if (!(DestPhysReg = getVirt2PhysRegMapSlot(DestVirtReg))) { - // If this is a copy try to reuse the input as the output; - // that will make the copy go away. - // If this is a copy, the source reg is a phys reg, and - // that reg is available, use that phys reg for DestPhysReg. - // If this is a copy, the source reg is a virtual reg, and - // the phys reg that was assigned to that virtual reg is now - // available, use that phys reg for DestPhysReg. (If it's now - // available that means this was the last use of the source.) 
- if (isCopy && - TargetRegisterInfo::isPhysicalRegister(SrcCopyReg) && - isPhysRegAvailable(SrcCopyReg)) { - DestPhysReg = SrcCopyReg; - assignVirtToPhysReg(DestVirtReg, DestPhysReg); - } else if (isCopy && - TargetRegisterInfo::isVirtualRegister(SrcCopyReg) && - SrcCopyPhysReg && isPhysRegAvailable(SrcCopyPhysReg) && - MF->getRegInfo().getRegClass(DestVirtReg)-> - contains(SrcCopyPhysReg)) { - DestPhysReg = SrcCopyPhysReg; - assignVirtToPhysReg(DestVirtReg, DestPhysReg); - } else - DestPhysReg = getReg(MBB, MI, DestVirtReg); - } - MF->getRegInfo().setPhysRegUsed(DestPhysReg); - markVirtRegModified(DestVirtReg); - getVirtRegLastUse(DestVirtReg) = std::make_pair((MachineInstr*)0, 0); - DEBUG(dbgs() << " Assigning " << TRI->getName(DestPhysReg) - << " to %reg" << DestVirtReg << "\n"); - MO.setReg(DestPhysReg); // Assign the output register + // If DestVirtReg already has a value, use it. + if (!(DestPhysReg = getVirt2PhysRegMapSlot(DestVirtReg))) { + // If this is a copy try to reuse the input as the output; + // that will make the copy go away. + // If this is a copy, the source reg is a phys reg, and + // that reg is available, use that phys reg for DestPhysReg. + // If this is a copy, the source reg is a virtual reg, and + // the phys reg that was assigned to that virtual reg is now + // available, use that phys reg for DestPhysReg. (If it's now + // available that means this was the last use of the source.) 
+ if (isCopy && + TargetRegisterInfo::isPhysicalRegister(SrcCopyReg) && + isPhysRegAvailable(SrcCopyReg)) { + DestPhysReg = SrcCopyReg; + assignVirtToPhysReg(DestVirtReg, DestPhysReg); + } else if (isCopy && + TargetRegisterInfo::isVirtualRegister(SrcCopyReg) && + SrcCopyPhysReg && isPhysRegAvailable(SrcCopyPhysReg) && + MF->getRegInfo().getRegClass(DestVirtReg)-> + contains(SrcCopyPhysReg)) { + DestPhysReg = SrcCopyPhysReg; + assignVirtToPhysReg(DestVirtReg, DestPhysReg); + } else + DestPhysReg = getReg(MBB, MI, DestVirtReg); } + MF->getRegInfo().setPhysRegUsed(DestPhysReg); + markVirtRegModified(DestVirtReg); + getVirtRegLastUse(DestVirtReg) = std::make_pair((MachineInstr*)0, 0); + DEBUG(dbgs() << " Assigning " << TRI->getName(DestPhysReg) + << " to %reg" << DestVirtReg << "\n"); + MO.setReg(DestPhysReg); // Assign the output register } // If this instruction defines any registers that are immediately dead, @@ -1059,21 +1074,20 @@ void RALocal::AllocateBasicBlock(MachineBasicBlock &MBB) { } else if (PhysRegsUsed[PhysReg] == -2) { // Unallocatable register dead, ignore. 
continue; - } - - if (PhysReg) { - DEBUG(dbgs() << " Register " << TRI->getName(PhysReg) - << " [%reg" << VirtReg - << "] is never used, removing it from live set\n"); - removePhysReg(PhysReg); - for (const unsigned *AliasSet = TRI->getAliasSet(PhysReg); - *AliasSet; ++AliasSet) { - if (PhysRegsUsed[*AliasSet] != -2) { - DEBUG(dbgs() << " Register " << TRI->getName(*AliasSet) - << " [%reg" << *AliasSet - << "] is never used, removing it from live set\n"); - removePhysReg(*AliasSet); - } + } else if (!PhysReg) + continue; + + DEBUG(dbgs() << " Register " << TRI->getName(PhysReg) + << " [%reg" << VirtReg + << "] is never used, removing it from live set\n"); + removePhysReg(PhysReg); + for (const unsigned *AliasSet = TRI->getAliasSet(PhysReg); + *AliasSet; ++AliasSet) { + if (PhysRegsUsed[*AliasSet] != -2) { + DEBUG(dbgs() << " Register " << TRI->getName(*AliasSet) + << " [%reg" << *AliasSet + << "] is never used, removing it from live set\n"); + removePhysReg(*AliasSet); } } } @@ -1143,8 +1157,10 @@ bool RALocal::runOnMachineFunction(MachineFunction &Fn) { StackSlotForVirtReg.grow(LastVirtReg); Virt2PhysRegMap.grow(LastVirtReg); Virt2LastUseMap.grow(LastVirtReg); - VirtRegModified.resize(LastVirtReg+1-TargetRegisterInfo::FirstVirtualRegister); - UsedInMultipleBlocks.resize(LastVirtReg+1-TargetRegisterInfo::FirstVirtualRegister); + VirtRegModified.resize(LastVirtReg+1 - + TargetRegisterInfo::FirstVirtualRegister); + UsedInMultipleBlocks.resize(LastVirtReg+1 - + TargetRegisterInfo::FirstVirtualRegister); // Loop over all of the basic blocks, eliminating virtual register references for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp index e532ade5f683..ecc49e2fc1b6 100644 --- a/lib/CodeGen/ScheduleDAGInstrs.cpp +++ b/lib/CodeGen/ScheduleDAGInstrs.cpp @@ -248,48 +248,47 @@ void ScheduleDAGInstrs::BuildSchedGraph(AliasAnalysis *AA) { unsigned DataLatency = SU->Latency; for 
(unsigned i = 0, e = UseList.size(); i != e; ++i) { SUnit *UseSU = UseList[i]; - if (UseSU != SU) { - unsigned LDataLatency = DataLatency; - // Optionally add in a special extra latency for nodes that - // feed addresses. - // TODO: Do this for register aliases too. - // TODO: Perhaps we should get rid of - // SpecialAddressLatency and just move this into - // adjustSchedDependency for the targets that care about - // it. - if (SpecialAddressLatency != 0 && !UnitLatencies) { - MachineInstr *UseMI = UseSU->getInstr(); - const TargetInstrDesc &UseTID = UseMI->getDesc(); - int RegUseIndex = UseMI->findRegisterUseOperandIdx(Reg); - assert(RegUseIndex >= 0 && "UseMI doesn's use register!"); - if ((UseTID.mayLoad() || UseTID.mayStore()) && - (unsigned)RegUseIndex < UseTID.getNumOperands() && - UseTID.OpInfo[RegUseIndex].isLookupPtrRegClass()) - LDataLatency += SpecialAddressLatency; - } - // Adjust the dependence latency using operand def/use - // information (if any), and then allow the target to - // perform its own adjustments. - const SDep& dep = SDep(SU, SDep::Data, LDataLatency, Reg); - if (!UnitLatencies) { - ComputeOperandLatency(SU, UseSU, (SDep &)dep); - ST.adjustSchedDependency(SU, UseSU, (SDep &)dep); - } - UseSU->addPred(dep); + if (UseSU == SU) + continue; + unsigned LDataLatency = DataLatency; + // Optionally add in a special extra latency for nodes that + // feed addresses. + // TODO: Do this for register aliases too. + // TODO: Perhaps we should get rid of + // SpecialAddressLatency and just move this into + // adjustSchedDependency for the targets that care about it. 
+ if (SpecialAddressLatency != 0 && !UnitLatencies) { + MachineInstr *UseMI = UseSU->getInstr(); + const TargetInstrDesc &UseTID = UseMI->getDesc(); + int RegUseIndex = UseMI->findRegisterUseOperandIdx(Reg); + assert(RegUseIndex >= 0 && "UseMI doesn's use register!"); + if ((UseTID.mayLoad() || UseTID.mayStore()) && + (unsigned)RegUseIndex < UseTID.getNumOperands() && + UseTID.OpInfo[RegUseIndex].isLookupPtrRegClass()) + LDataLatency += SpecialAddressLatency; } + // Adjust the dependence latency using operand def/use + // information (if any), and then allow the target to + // perform its own adjustments. + const SDep& dep = SDep(SU, SDep::Data, LDataLatency, Reg); + if (!UnitLatencies) { + ComputeOperandLatency(SU, UseSU, (SDep &)dep); + ST.adjustSchedDependency(SU, UseSU, (SDep &)dep); + } + UseSU->addPred(dep); } for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) { std::vector &UseList = Uses[*Alias]; for (unsigned i = 0, e = UseList.size(); i != e; ++i) { SUnit *UseSU = UseList[i]; - if (UseSU != SU) { - const SDep& dep = SDep(SU, SDep::Data, DataLatency, *Alias); - if (!UnitLatencies) { - ComputeOperandLatency(SU, UseSU, (SDep &)dep); - ST.adjustSchedDependency(SU, UseSU, (SDep &)dep); - } - UseSU->addPred(dep); + if (UseSU == SU) + continue; + const SDep& dep = SDep(SU, SDep::Data, DataLatency, *Alias); + if (!UnitLatencies) { + ComputeOperandLatency(SU, UseSU, (SDep &)dep); + ST.adjustSchedDependency(SU, UseSU, (SDep &)dep); } + UseSU->addPred(dep); } } @@ -528,7 +527,8 @@ void ScheduleDAGInstrs::ComputeOperandLatency(SUnit *Def, SUnit *Use, MachineInstr *DefMI = Def->getInstr(); int DefIdx = DefMI->findRegisterDefOperandIdx(Reg); if (DefIdx != -1) { - int DefCycle = InstrItins.getOperandCycle(DefMI->getDesc().getSchedClass(), DefIdx); + int DefCycle = InstrItins.getOperandCycle(DefMI->getDesc().getSchedClass(), + DefIdx); if (DefCycle >= 0) { MachineInstr *UseMI = Use->getInstr(); const unsigned UseClass = UseMI->getDesc().getSchedClass(); 
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index aa283ad894e4..a336e0a01b56 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -5022,18 +5022,6 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) { SDValue Chain = LD->getChain(); SDValue Ptr = LD->getBasePtr(); - // Try to infer better alignment information than the load already has. - if (OptLevel != CodeGenOpt::None && LD->isUnindexed()) { - if (unsigned Align = DAG.InferPtrAlignment(Ptr)) { - if (Align > LD->getAlignment()) - return DAG.getExtLoad(LD->getExtensionType(), N->getDebugLoc(), - LD->getValueType(0), - Chain, Ptr, LD->getSrcValue(), - LD->getSrcValueOffset(), LD->getMemoryVT(), - LD->isVolatile(), LD->isNonTemporal(), Align); - } - } - // If load is not volatile and there are no uses of the loaded value (and // the updated indexed value in case of indexed loads), change uses of the // chain value into uses of the chain input (i.e. delete the dead load). @@ -5099,6 +5087,18 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) { } } + // Try to infer better alignment information than the load already has. + if (OptLevel != CodeGenOpt::None && LD->isUnindexed()) { + if (unsigned Align = DAG.InferPtrAlignment(Ptr)) { + if (Align > LD->getAlignment()) + return DAG.getExtLoad(LD->getExtensionType(), N->getDebugLoc(), + LD->getValueType(0), + Chain, Ptr, LD->getSrcValue(), + LD->getSrcValueOffset(), LD->getMemoryVT(), + LD->isVolatile(), LD->isNonTemporal(), Align); + } + } + if (CombinerAA) { // Walk up chain skipping non-aliasing memory nodes. SDValue BetterChain = FindBetterChain(N, Chain); @@ -5250,17 +5250,6 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { SDValue Value = ST->getValue(); SDValue Ptr = ST->getBasePtr(); - // Try to infer better alignment information than the store already has. 
- if (OptLevel != CodeGenOpt::None && ST->isUnindexed()) { - if (unsigned Align = DAG.InferPtrAlignment(Ptr)) { - if (Align > ST->getAlignment()) - return DAG.getTruncStore(Chain, N->getDebugLoc(), Value, - Ptr, ST->getSrcValue(), - ST->getSrcValueOffset(), ST->getMemoryVT(), - ST->isVolatile(), ST->isNonTemporal(), Align); - } - } - // If this is a store of a bit convert, store the input value if the // resultant store does not need a higher alignment than the original. if (Value.getOpcode() == ISD::BIT_CONVERT && !ST->isTruncatingStore() && @@ -5351,6 +5340,17 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { } } + // Try to infer better alignment information than the store already has. + if (OptLevel != CodeGenOpt::None && ST->isUnindexed()) { + if (unsigned Align = DAG.InferPtrAlignment(Ptr)) { + if (Align > ST->getAlignment()) + return DAG.getTruncStore(Chain, N->getDebugLoc(), Value, + Ptr, ST->getSrcValue(), + ST->getSrcValueOffset(), ST->getMemoryVT(), + ST->isVolatile(), ST->isNonTemporal(), Align); + } + } + if (CombinerAA) { // Walk up chain skipping non-aliasing memory nodes. SDValue BetterChain = FindBetterChain(N, Chain); diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp index 3fc30ff90886..e4e9ef405f63 100644 --- a/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -340,10 +340,9 @@ bool FastISel::SelectCall(User *I) { StaticAllocaMap.find(AI); if (SI == StaticAllocaMap.end()) break; // VLAs. int FI = SI->second; - if (MMI) { - if (MDNode *Dbg = DI->getMetadata("dbg")) - MMI->setVariableDbgInfo(DI->getVariable(), FI, Dbg); - } + if (MDNode *Dbg = DI->getDbgMetadata()) + MMI->setVariableDbgInfo(DI->getVariable(), FI, Dbg); + // Building the map above is target independent. Generating DBG_VALUE // inline is target dependent; do this now. 
(void)TargetSelectInstruction(cast(I)); diff --git a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp index 50f4c325a161..4fb2aa299cbe 100644 --- a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp +++ b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp @@ -281,8 +281,17 @@ unsigned FunctionLoweringInfo::CreateRegForValue(const Value *V) { GlobalVariable *llvm::ExtractTypeInfo(Value *V) { V = V->stripPointerCasts(); GlobalVariable *GV = dyn_cast(V); - assert ((GV || isa(V)) && - "TypeInfo must be a global variable or NULL"); + + if (GV && GV->getName() == ".llvm.eh.catch.all.value") { + assert(GV->hasInitializer() && + "The EH catch-all value must have an initializer"); + Value *Init = GV->getInitializer(); + GV = dyn_cast(Init); + if (!GV) V = cast(Init); + } + + assert((GV || isa(V)) && + "TypeInfo must be a global variable or NULL"); return GV; } diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index fda094d393bf..28ba343e7fe6 100644 --- a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -264,7 +264,8 @@ void InstrEmitter::AddRegisterOperand(MachineInstr *MI, SDValue Op, unsigned IIOpNum, const TargetInstrDesc *II, - DenseMap &VRBaseMap) { + DenseMap &VRBaseMap, + bool IsDebug) { assert(Op.getValueType() != MVT::Other && Op.getValueType() != MVT::Flag && "Chain and flag operands should occur at end of operand list!"); @@ -295,7 +296,11 @@ InstrEmitter::AddRegisterOperand(MachineInstr *MI, SDValue Op, } } - MI->addOperand(MachineOperand::CreateReg(VReg, isOptDef)); + MI->addOperand(MachineOperand::CreateReg(VReg, isOptDef, + false/*isImp*/, false/*isKill*/, + false/*isDead*/, false/*isUndef*/, + false/*isEarlyClobber*/, + 0/*SubReg*/, IsDebug)); } /// AddOperand - Add the specified operand to the specified machine instr. 
II @@ -305,9 +310,10 @@ InstrEmitter::AddRegisterOperand(MachineInstr *MI, SDValue Op, void InstrEmitter::AddOperand(MachineInstr *MI, SDValue Op, unsigned IIOpNum, const TargetInstrDesc *II, - DenseMap &VRBaseMap) { + DenseMap &VRBaseMap, + bool IsDebug) { if (Op.isMachineOpcode()) { - AddRegisterOperand(MI, Op, IIOpNum, II, VRBaseMap); + AddRegisterOperand(MI, Op, IIOpNum, II, VRBaseMap, IsDebug); } else if (ConstantSDNode *C = dyn_cast(Op)) { MI->addOperand(MachineOperand::CreateImm(C->getSExtValue())); } else if (ConstantFPSDNode *F = dyn_cast(Op)) { @@ -356,7 +362,7 @@ void InstrEmitter::AddOperand(MachineInstr *MI, SDValue Op, assert(Op.getValueType() != MVT::Other && Op.getValueType() != MVT::Flag && "Chain and flag operands should occur at end of operand list!"); - AddRegisterOperand(MI, Op, IIOpNum, II, VRBaseMap); + AddRegisterOperand(MI, Op, IIOpNum, II, VRBaseMap, IsDebug); } } @@ -498,164 +504,156 @@ InstrEmitter::EmitCopyToRegClassNode(SDNode *Node, assert(isNew && "Node emitted out of order - early"); } -/// EmitDbgValue - Generate any debug info that refers to this Node. Constant -/// dbg_value is not handled here. -void -InstrEmitter::EmitDbgValue(SDNode *Node, - DenseMap &VRBaseMap, - SDDbgValue *sd) { - if (!Node->getHasDebugValue()) - return; - if (!sd) - return; - assert(sd->getKind() == SDDbgValue::SDNODE); - unsigned VReg = getVR(SDValue(sd->getSDNode(), sd->getResNo()), VRBaseMap); - const TargetInstrDesc &II = TII->get(TargetOpcode::DBG_VALUE); - DebugLoc DL = sd->getDebugLoc(); - MachineInstr *MI; - if (VReg) { - MI = BuildMI(*MF, DL, II).addReg(VReg, RegState::Debug). - addImm(sd->getOffset()). - addMetadata(sd->getMDPtr()); - } else { - // Insert an Undef so we can see what we dropped. - MI = BuildMI(*MF, DL, II).addReg(0U).addImm(sd->getOffset()). - addMetadata(sd->getMDPtr()); - } - MBB->insert(InsertPos, MI); -} - -/// EmitDbgValue - Generate debug info that does not refer to a SDNode. 
-void -InstrEmitter::EmitDbgValue(SDDbgValue *sd, +/// EmitDbgValue - Generate machine instruction for a dbg_value node. +/// +MachineInstr *InstrEmitter::EmitDbgValue(SDDbgValue *SD, + MachineBasicBlock *InsertBB, + DenseMap &VRBaseMap, DenseMap *EM) { - if (!sd) - return; + uint64_t Offset = SD->getOffset(); + MDNode* MDPtr = SD->getMDPtr(); + DebugLoc DL = SD->getDebugLoc(); + const TargetInstrDesc &II = TII->get(TargetOpcode::DBG_VALUE); - uint64_t Offset = sd->getOffset(); - MDNode* mdPtr = sd->getMDPtr(); - SDDbgValue::DbgValueKind kind = sd->getKind(); - DebugLoc DL = sd->getDebugLoc(); - MachineInstr* MI; - if (kind == SDDbgValue::CONST) { - Value *V = sd->getConst(); + MachineInstrBuilder MIB = BuildMI(*MF, DL, II); + if (SD->getKind() == SDDbgValue::SDNODE) { + AddOperand(&*MIB, SDValue(SD->getSDNode(), SD->getResNo()), + (*MIB).getNumOperands(), &II, VRBaseMap, true /*IsDebug*/); + } else if (SD->getKind() == SDDbgValue::CONST) { + Value *V = SD->getConst(); if (ConstantInt *CI = dyn_cast(V)) { - MI = BuildMI(*MF, DL, II).addImm(CI->getZExtValue()). - addImm(Offset).addMetadata(mdPtr); + MIB.addImm(CI->getSExtValue()); } else if (ConstantFP *CF = dyn_cast(V)) { - MI = BuildMI(*MF, DL, II).addFPImm(CF). - addImm(Offset).addMetadata(mdPtr); + MIB.addFPImm(CF); } else { // Could be an Undef. In any case insert an Undef so we can see what we // dropped. - MI = BuildMI(*MF, DL, II).addReg(0U). - addImm(Offset).addMetadata(mdPtr); + MIB.addReg(0U); } - } else if (kind == SDDbgValue::FRAMEIX) { - unsigned FrameIx = sd->getFrameIx(); + } else if (SD->getKind() == SDDbgValue::FRAMEIX) { + unsigned FrameIx = SD->getFrameIx(); // Stack address; this needs to be lowered in target-dependent fashion. // FIXME test that the target supports this somehow; if not emit Undef. // Create a pseudo for EmitInstrWithCustomInserter's consumption. - MI = BuildMI(*MF, DL, II).addImm(FrameIx). 
- addImm(Offset).addMetadata(mdPtr); + MIB.addImm(FrameIx).addImm(Offset).addMetadata(MDPtr); + abort(); + TLI->EmitInstrWithCustomInserter(&*MIB, InsertBB, EM); + return 0; + } else { + // Insert an Undef so we can see what we dropped. + MIB.addReg(0U); + } + + MIB.addImm(Offset).addMetadata(MDPtr); + return &*MIB; +} + +/// EmitMachineNode - Generate machine code for a target-specific node and +/// needed dependencies. +/// +void InstrEmitter:: +EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned, + DenseMap &VRBaseMap, + DenseMap *EM) { + unsigned Opc = Node->getMachineOpcode(); + + // Handle subreg insert/extract specially + if (Opc == TargetOpcode::EXTRACT_SUBREG || + Opc == TargetOpcode::INSERT_SUBREG || + Opc == TargetOpcode::SUBREG_TO_REG) { + EmitSubregNode(Node, VRBaseMap); + return; + } + + // Handle COPY_TO_REGCLASS specially. + if (Opc == TargetOpcode::COPY_TO_REGCLASS) { + EmitCopyToRegClassNode(Node, VRBaseMap); + return; + } + + if (Opc == TargetOpcode::IMPLICIT_DEF) + // We want a unique VR for each IMPLICIT_DEF use. + return; + + const TargetInstrDesc &II = TII->get(Opc); + unsigned NumResults = CountResults(Node); + unsigned NodeOperands = CountOperands(Node); + bool HasPhysRegOuts = NumResults > II.getNumDefs() && II.getImplicitDefs()!=0; +#ifndef NDEBUG + unsigned NumMIOperands = NodeOperands + NumResults; + if (II.isVariadic()) + assert(NumMIOperands >= II.getNumOperands() && + "Too few operands for a variadic node!"); + else + assert(NumMIOperands >= II.getNumOperands() && + NumMIOperands <= II.getNumOperands()+II.getNumImplicitDefs() && + "#operands for dag node doesn't match .td file!"); +#endif + + // Create the new machine instruction. + MachineInstr *MI = BuildMI(*MF, Node->getDebugLoc(), II); + + // Add result register values for things that are defined by this + // instruction. 
+ if (NumResults) + CreateVirtualRegisters(Node, MI, II, IsClone, IsCloned, VRBaseMap); + + // Emit all of the actual operands of this instruction, adding them to the + // instruction as appropriate. + bool HasOptPRefs = II.getNumDefs() > NumResults; + assert((!HasOptPRefs || !HasPhysRegOuts) && + "Unable to cope with optional defs and phys regs defs!"); + unsigned NumSkip = HasOptPRefs ? II.getNumDefs() - NumResults : 0; + for (unsigned i = NumSkip; i != NodeOperands; ++i) + AddOperand(MI, Node->getOperand(i), i-NumSkip+II.getNumDefs(), &II, + VRBaseMap); + + // Transfer all of the memory reference descriptions of this instruction. + MI->setMemRefs(cast(Node)->memoperands_begin(), + cast(Node)->memoperands_end()); + + if (II.usesCustomInsertionHook()) { + // Insert this instruction into the basic block using a target + // specific inserter which may returns a new basic block. MBB = TLI->EmitInstrWithCustomInserter(MI, MBB, EM); InsertPos = MBB->end(); return; - } else { - // Insert an Undef so we can see what we dropped. - MI = BuildMI(*MF, DL, II).addReg(0U). - addImm(Offset).addMetadata(mdPtr); } + MBB->insert(InsertPos, MI); + + // Additional results must be an physical register def. + if (HasPhysRegOuts) { + for (unsigned i = II.getNumDefs(); i < NumResults; ++i) { + unsigned Reg = II.getImplicitDefs()[i - II.getNumDefs()]; + if (Node->hasAnyUseOfValue(i)) + EmitCopyFromReg(Node, i, IsClone, IsCloned, Reg, VRBaseMap); + // If there are no uses, mark the register as dead now, so that + // MachineLICM/Sink can see that it's dead. Don't do this if the + // node has a Flag value, for the benefit of targets still using + // Flag for values in physregs. + else if (Node->getValueType(Node->getNumValues()-1) != MVT::Flag) + MI->addRegisterDead(Reg, TRI); + } + } + + // If the instruction has implicit defs and the node doesn't, mark the + // implicit def as dead. 
If the node has any flag outputs, we don't do this + // because we don't know what implicit defs are being used by flagged nodes. + if (Node->getValueType(Node->getNumValues()-1) != MVT::Flag) + if (const unsigned *IDList = II.getImplicitDefs()) { + for (unsigned i = NumResults, e = II.getNumDefs()+II.getNumImplicitDefs(); + i != e; ++i) + MI->addRegisterDead(IDList[i-II.getNumDefs()], TRI); + } + return; } -/// EmitNode - Generate machine code for a node and needed dependencies. -/// -void InstrEmitter::EmitNode(SDNode *Node, bool IsClone, bool IsCloned, - DenseMap &VRBaseMap, - DenseMap *EM) { - // If machine instruction - if (Node->isMachineOpcode()) { - unsigned Opc = Node->getMachineOpcode(); - - // Handle subreg insert/extract specially - if (Opc == TargetOpcode::EXTRACT_SUBREG || - Opc == TargetOpcode::INSERT_SUBREG || - Opc == TargetOpcode::SUBREG_TO_REG) { - EmitSubregNode(Node, VRBaseMap); - return; - } - - // Handle COPY_TO_REGCLASS specially. - if (Opc == TargetOpcode::COPY_TO_REGCLASS) { - EmitCopyToRegClassNode(Node, VRBaseMap); - return; - } - - if (Opc == TargetOpcode::IMPLICIT_DEF) - // We want a unique VR for each IMPLICIT_DEF use. - return; - - const TargetInstrDesc &II = TII->get(Opc); - unsigned NumResults = CountResults(Node); - unsigned NodeOperands = CountOperands(Node); - bool HasPhysRegOuts = (NumResults > II.getNumDefs()) && - II.getImplicitDefs() != 0; -#ifndef NDEBUG - unsigned NumMIOperands = NodeOperands + NumResults; - assert((II.getNumOperands() == NumMIOperands || - HasPhysRegOuts || II.isVariadic()) && - "#operands for dag node doesn't match .td file!"); -#endif - - // Create the new machine instruction. - MachineInstr *MI = BuildMI(*MF, Node->getDebugLoc(), II); - - // Add result register values for things that are defined by this - // instruction. 
- if (NumResults) - CreateVirtualRegisters(Node, MI, II, IsClone, IsCloned, VRBaseMap); - - // Emit all of the actual operands of this instruction, adding them to the - // instruction as appropriate. - bool HasOptPRefs = II.getNumDefs() > NumResults; - assert((!HasOptPRefs || !HasPhysRegOuts) && - "Unable to cope with optional defs and phys regs defs!"); - unsigned NumSkip = HasOptPRefs ? II.getNumDefs() - NumResults : 0; - for (unsigned i = NumSkip; i != NodeOperands; ++i) - AddOperand(MI, Node->getOperand(i), i-NumSkip+II.getNumDefs(), &II, - VRBaseMap); - - // Transfer all of the memory reference descriptions of this instruction. - MI->setMemRefs(cast(Node)->memoperands_begin(), - cast(Node)->memoperands_end()); - - if (II.usesCustomInsertionHook()) { - // Insert this instruction into the basic block using a target - // specific inserter which may returns a new basic block. - MBB = TLI->EmitInstrWithCustomInserter(MI, MBB, EM); - InsertPos = MBB->end(); - } else { - MBB->insert(InsertPos, MI); - } - - // Additional results must be an physical register def. - if (HasPhysRegOuts) { - for (unsigned i = II.getNumDefs(); i < NumResults; ++i) { - unsigned Reg = II.getImplicitDefs()[i - II.getNumDefs()]; - if (Node->hasAnyUseOfValue(i)) - EmitCopyFromReg(Node, i, IsClone, IsCloned, Reg, VRBaseMap); - // If there are no uses, mark the register as dead now, so that - // MachineLICM/Sink can see that it's dead. Don't do this if the - // node has a Flag value, for the benefit of targets still using - // Flag for values in physregs. - else if (Node->getValueType(Node->getNumValues()-1) != MVT::Flag) - MI->addRegisterDead(Reg, TRI); - } - } - return; - } - +/// EmitSpecialNode - Generate machine code for a target-independent node and +/// needed dependencies. 
+void InstrEmitter:: +EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned, + DenseMap &VRBaseMap) { switch (Node->getOpcode()) { default: #ifndef NDEBUG diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.h b/lib/CodeGen/SelectionDAG/InstrEmitter.h index eefcd73e6e94..baabb7554b60 100644 --- a/lib/CodeGen/SelectionDAG/InstrEmitter.h +++ b/lib/CodeGen/SelectionDAG/InstrEmitter.h @@ -64,7 +64,8 @@ class InstrEmitter { void AddRegisterOperand(MachineInstr *MI, SDValue Op, unsigned IIOpNum, const TargetInstrDesc *II, - DenseMap &VRBaseMap); + DenseMap &VRBaseMap, + bool IsDebug = false); /// AddOperand - Add the specified operand to the specified machine instr. II /// specifies the instruction information for the node, and IIOpNum is the @@ -73,7 +74,8 @@ class InstrEmitter { void AddOperand(MachineInstr *MI, SDValue Op, unsigned IIOpNum, const TargetInstrDesc *II, - DenseMap &VRBaseMap); + DenseMap &VRBaseMap, + bool IsDebug = false); /// EmitSubregNode - Generate machine code for subreg nodes. /// @@ -98,22 +100,23 @@ class InstrEmitter { /// MachineInstr. static unsigned CountOperands(SDNode *Node); - /// EmitDbgValue - Generate any debug info that refers to this Node. Constant - /// dbg_value is not handled here. - void EmitDbgValue(SDNode *Node, - DenseMap &VRBaseMap, - SDDbgValue* sd); - - - /// EmitDbgValue - Generate a constant DBG_VALUE. No node is involved. - void EmitDbgValue(SDDbgValue* sd, - DenseMap *EM); + /// EmitDbgValue - Generate machine instruction for a dbg_value node. + /// + MachineInstr *EmitDbgValue(SDDbgValue *SD, + MachineBasicBlock *InsertBB, + DenseMap &VRBaseMap, + DenseMap *EM); /// EmitNode - Generate machine code for a node and needed dependencies. 
/// void EmitNode(SDNode *Node, bool IsClone, bool IsCloned, DenseMap &VRBaseMap, - DenseMap *EM); + DenseMap *EM) { + if (Node->isMachineOpcode()) + EmitMachineNode(Node, IsClone, IsCloned, VRBaseMap, EM); + else + EmitSpecialNode(Node, IsClone, IsCloned, VRBaseMap); + } /// getBlock - Return the current basic block. MachineBasicBlock *getBlock() { return MBB; } @@ -124,6 +127,13 @@ class InstrEmitter { /// InstrEmitter - Construct an InstrEmitter and set it to start inserting /// at the given position in the given block. InstrEmitter(MachineBasicBlock *mbb, MachineBasicBlock::iterator insertpos); + +private: + void EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned, + DenseMap &VRBaseMap, + DenseMap *EM); + void EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned, + DenseMap &VRBaseMap); }; } diff --git a/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h b/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h index dbbd753c5bb4..7638ea2ae162 100644 --- a/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h +++ b/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h @@ -47,10 +47,12 @@ class SDDbgValue { uint64_t Offset; DebugLoc DL; unsigned Order; + bool Invalid; public: // Constructor for non-constants. SDDbgValue(MDNode *mdP, SDNode *N, unsigned R, uint64_t off, DebugLoc dl, - unsigned O) : mdPtr(mdP), Offset(off), DL(dl), Order(O) { + unsigned O) : mdPtr(mdP), Offset(off), DL(dl), Order(O), + Invalid(false) { kind = SDNODE; u.s.Node = N; u.s.ResNo = R; @@ -58,14 +60,14 @@ class SDDbgValue { // Constructor for constants. SDDbgValue(MDNode *mdP, Value *C, uint64_t off, DebugLoc dl, unsigned O) : - mdPtr(mdP), Offset(off), DL(dl), Order(O) { + mdPtr(mdP), Offset(off), DL(dl), Order(O), Invalid(false) { kind = CONST; u.Const = C; } // Constructor for frame indices. 
SDDbgValue(MDNode *mdP, unsigned FI, uint64_t off, DebugLoc dl, unsigned O) : - mdPtr(mdP), Offset(off), DL(dl), Order(O) { + mdPtr(mdP), Offset(off), DL(dl), Order(O), Invalid(false) { kind = FRAMEIX; u.FrameIx = FI; } @@ -97,6 +99,12 @@ class SDDbgValue { // Returns the SDNodeOrder. This is the order of the preceding node in the // input. unsigned getOrder() { return Order; } + + // setIsInvalidated / isInvalidated - Setter / getter of the "Invalidated" + // property. A SDDbgValue is invalid if the SDNode that produces the value is + // deleted. + void setIsInvalidated() { Invalid = true; } + bool isInvalidated() { return Invalid; } }; } // end llvm namespace diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp index c13565a35a45..e7ab2f003963 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp @@ -23,6 +23,7 @@ #include "llvm/Target/TargetSubtarget.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Support/Debug.h" @@ -407,19 +408,65 @@ void ScheduleDAGSDNodes::dumpNode(const SUnit *SU) const { } } +namespace { + struct OrderSorter { + bool operator()(const std::pair &A, + const std::pair &B) { + return A.first < B.first; + } + }; +} + +// ProcessSourceNode - Process nodes with source order numbers. These are added +// to a vector which EmitSchedule use to determine how to insert dbg_value +// instructions in the right order. +static void ProcessSourceNode(SDNode *N, SelectionDAG *DAG, + InstrEmitter &Emitter, + DenseMap *EM, + DenseMap &VRBaseMap, + SmallVector, 32> &Orders, + SmallSet &Seen) { + unsigned Order = DAG->GetOrdering(N); + if (!Order || !Seen.insert(Order)) + return; + + MachineBasicBlock *BB = Emitter.getBlock(); + if (BB->empty() || BB->back().isPHI()) { + // Did not insert any instruction. 
+ Orders.push_back(std::make_pair(Order, (MachineInstr*)0)); + return; + } + + Orders.push_back(std::make_pair(Order, &BB->back())); + if (!N->getHasDebugValue()) + return; + // Opportunistically insert immediate dbg_value uses, i.e. those with source + // order number right after the N. + MachineBasicBlock::iterator InsertPos = Emitter.getInsertPos(); + SmallVector &DVs = DAG->GetDbgValues(N); + for (unsigned i = 0, e = DVs.size(); i != e; ++i) { + if (DVs[i]->isInvalidated()) + continue; + unsigned DVOrder = DVs[i]->getOrder(); + if (DVOrder == ++Order) { + MachineInstr *DbgMI = Emitter.EmitDbgValue(DVs[i], BB, VRBaseMap, EM); + Orders.push_back(std::make_pair(DVOrder, DbgMI)); + BB->insert(InsertPos, DbgMI); + DVs[i]->setIsInvalidated(); + } + } +} + + /// EmitSchedule - Emit the machine code in scheduled order. MachineBasicBlock *ScheduleDAGSDNodes:: EmitSchedule(DenseMap *EM) { InstrEmitter Emitter(BB, InsertPos); DenseMap VRBaseMap; DenseMap CopyVRBaseMap; - - // For now, any constant debug info nodes go at the beginning. - for (SDDbgInfo::ConstDbgIterator I = DAG->DbgConstBegin(), - E = DAG->DbgConstEnd(); I!=E; I++) { - Emitter.EmitDbgValue(*I, EM); - delete *I; - } + SmallVector, 32> Orders; + SmallSet Seen; + bool HasDbg = DAG->hasDebugValues(); for (unsigned i = 0, e = Sequence.size(); i != e; i++) { SUnit *SU = Sequence[i]; @@ -442,22 +489,80 @@ EmitSchedule(DenseMap *EM) { N = N->getFlaggedNode()) FlaggedNodes.push_back(N); while (!FlaggedNodes.empty()) { + SDNode *N = FlaggedNodes.back(); Emitter.EmitNode(FlaggedNodes.back(), SU->OrigNode != SU, SU->isCloned, VRBaseMap, EM); - if (FlaggedNodes.back()->getHasDebugValue()) - if (SDDbgValue *sd = DAG->GetDbgInfo(FlaggedNodes.back())) { - Emitter.EmitDbgValue(FlaggedNodes.back(), VRBaseMap, sd); - delete sd; - } + // Remember the the source order of the inserted instruction. 
+ if (HasDbg) + ProcessSourceNode(N, DAG, Emitter, EM, VRBaseMap, Orders, Seen); FlaggedNodes.pop_back(); } Emitter.EmitNode(SU->getNode(), SU->OrigNode != SU, SU->isCloned, VRBaseMap, EM); - if (SU->getNode()->getHasDebugValue()) - if (SDDbgValue *sd = DAG->GetDbgInfo(SU->getNode())) { - Emitter.EmitDbgValue(SU->getNode(), VRBaseMap, sd); - delete sd; + // Remember the the source order of the inserted instruction. + if (HasDbg) + ProcessSourceNode(SU->getNode(), DAG, Emitter, EM, VRBaseMap, Orders, + Seen); + } + + // Insert all the dbg_value which have not already been inserted in source + // order sequence. + if (HasDbg) { + MachineBasicBlock::iterator BBBegin = BB->empty() ? BB->end() : BB->begin(); + while (BBBegin != BB->end() && BBBegin->isPHI()) + ++BBBegin; + + // Sort the source order instructions and use the order to insert debug + // values. + std::sort(Orders.begin(), Orders.end(), OrderSorter()); + + SDDbgInfo::DbgIterator DI = DAG->DbgBegin(); + SDDbgInfo::DbgIterator DE = DAG->DbgEnd(); + // Now emit the rest according to source order. + unsigned LastOrder = 0; + MachineInstr *LastMI = 0; + for (unsigned i = 0, e = Orders.size(); i != e && DI != DE; ++i) { + unsigned Order = Orders[i].first; + MachineInstr *MI = Orders[i].second; + // Insert all SDDbgValue's whose order(s) are before "Order". + if (!MI) + continue; + MachineBasicBlock *MIBB = MI->getParent(); +#ifndef NDEBUG + unsigned LastDIOrder = 0; +#endif + for (; DI != DE && + (*DI)->getOrder() >= LastOrder && (*DI)->getOrder() < Order; ++DI) { +#ifndef NDEBUG + assert((*DI)->getOrder() >= LastDIOrder && + "SDDbgValue nodes must be in source order!"); + LastDIOrder = (*DI)->getOrder(); +#endif + if ((*DI)->isInvalidated()) + continue; + MachineInstr *DbgMI = Emitter.EmitDbgValue(*DI, MIBB, VRBaseMap, EM); + if (!LastOrder) + // Insert to start of the BB (after PHIs). 
+ BB->insert(BBBegin, DbgMI); + else { + MachineBasicBlock::iterator Pos = MI; + MIBB->insert(llvm::next(Pos), DbgMI); + } } + LastOrder = Order; + LastMI = MI; + } + // Add trailing DbgValue's before the terminator. FIXME: May want to add + // some of them before one or more conditional branches? + while (DI != DE) { + MachineBasicBlock *InsertBB = Emitter.getBlock(); + MachineBasicBlock::iterator Pos= Emitter.getBlock()->getFirstTerminator(); + if (!(*DI)->isInvalidated()) { + MachineInstr *DbgMI= Emitter.EmitDbgValue(*DI, InsertBB, VRBaseMap, EM); + InsertBB->insert(Pos, DbgMI); + } + ++DI; + } } BB = Emitter.getBlock(); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index ed9146dc08d9..0ba65ab7f96d 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -598,8 +598,10 @@ void SelectionDAG::DeallocateNode(SDNode *N) { // Remove the ordering of this node. Ordering->remove(N); - // And its entry in the debug info table, if any. - DbgInfo->remove(N); + // If any of the SDDbgValue nodes refer to this SDNode, invalidate them. 
+ SmallVector &DbgVals = DbgInfo->getSDDbgValues(N); + for (unsigned i = 0, e = DbgVals.size(); i != e; ++i) + DbgVals[i]->setIsInvalidated(); } /// RemoveNodeFromCSEMaps - Take the specified node out of the CSE map that @@ -811,6 +813,7 @@ void SelectionDAG::init(MachineFunction &mf, MachineModuleInfo *mmi, SelectionDAG::~SelectionDAG() { allnodes_clear(); delete Ordering; + DbgInfo->clear(); delete DbgInfo; } @@ -839,6 +842,7 @@ void SelectionDAG::clear() { Root = getEntryNode(); delete Ordering; Ordering = new SDNodeOrdering(); + DbgInfo->clear(); delete DbgInfo; DbgInfo = new SDDbgInfo(); } @@ -3128,11 +3132,17 @@ static SDValue getMemsetStringVal(EVT VT, DebugLoc dl, SelectionDAG &DAG, if (Str.empty()) { if (VT.isInteger()) return DAG.getConstant(0, VT); - unsigned NumElts = VT.getVectorNumElements(); - MVT EltVT = (VT.getVectorElementType() == MVT::f32) ? MVT::i32 : MVT::i64; - return DAG.getNode(ISD::BIT_CONVERT, dl, VT, - DAG.getConstant(0, - EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts))); + else if (VT.getSimpleVT().SimpleTy == MVT::f32 || + VT.getSimpleVT().SimpleTy == MVT::f64) + return DAG.getConstantFP(0.0, VT); + else if (VT.isVector()) { + unsigned NumElts = VT.getVectorNumElements(); + MVT EltVT = (VT.getVectorElementType() == MVT::f32) ? MVT::i32 : MVT::i64; + return DAG.getNode(ISD::BIT_CONVERT, dl, VT, + DAG.getConstant(0, EVT::getVectorVT(*DAG.getContext(), + EltVT, NumElts))); + } else + llvm_unreachable("Expected type!"); } assert(!VT.isVector() && "Can't handle vector type here!"); @@ -3180,51 +3190,33 @@ static bool isMemSrcFromString(SDValue Src, std::string &Str) { return false; } -/// MeetsMaxMemopRequirement - Determines if the number of memory ops required -/// to replace the memset / memcpy is below the threshold. It also returns the -/// types of the sequence of memory ops to perform memset / memcpy. 
-static -bool MeetsMaxMemopRequirement(std::vector &MemOps, - SDValue Dst, SDValue Src, - unsigned Limit, uint64_t Size, unsigned &Align, - std::string &Str, bool &isSrcStr, - SelectionDAG &DAG, - const TargetLowering &TLI) { - isSrcStr = isMemSrcFromString(Src, Str); - bool isSrcConst = isa(Src); - EVT VT = TLI.getOptimalMemOpType(Size, Align, isSrcConst, isSrcStr, DAG); - bool AllowUnalign = TLI.allowsUnalignedMemoryAccesses(VT); - if (VT != MVT::Other) { - const Type *Ty = VT.getTypeForEVT(*DAG.getContext()); - unsigned NewAlign = (unsigned) TLI.getTargetData()->getABITypeAlignment(Ty); - // If source is a string constant, this will require an unaligned load. - if (NewAlign > Align && (isSrcConst || AllowUnalign)) { - if (Dst.getOpcode() != ISD::FrameIndex) { - // Can't change destination alignment. It requires a unaligned store. - if (AllowUnalign) - VT = MVT::Other; - } else { - int FI = cast(Dst)->getIndex(); - MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); - if (MFI->isFixedObjectIndex(FI)) { - // Can't change destination alignment. It requires a unaligned store. - if (AllowUnalign) - VT = MVT::Other; - } else { - // Give the stack frame object a larger alignment if needed. - if (MFI->getObjectAlignment(FI) < NewAlign) - MFI->setObjectAlignment(FI, NewAlign); - Align = NewAlign; - } - } - } - } +/// FindOptimalMemOpLowering - Determines the optimial series memory ops +/// to replace the memset / memcpy. Return true if the number of memory ops +/// is below the threshold. It returns the types of the sequence of +/// memory ops to perform memset / memcpy by reference. 
+static bool FindOptimalMemOpLowering(std::vector &MemOps, + unsigned Limit, uint64_t Size, + unsigned DstAlign, unsigned SrcAlign, + bool SafeToUseFP, + SelectionDAG &DAG, + const TargetLowering &TLI) { + assert((SrcAlign == 0 || SrcAlign >= DstAlign) && + "Expecting memcpy / memset source to meet alignment requirement!"); + // If 'SrcAlign' is zero, that means the memory operation does not need load + // the value, i.e. memset or memcpy from constant string. Otherwise, it's + // the inferred alignment of the source. 'DstAlign', on the other hand, is the + // specified alignment of the memory operation. If it is zero, that means + // it's possible to change the alignment of the destination. + EVT VT = TLI.getOptimalMemOpType(Size, DstAlign, SrcAlign, SafeToUseFP, DAG); if (VT == MVT::Other) { - if (TLI.allowsUnalignedMemoryAccesses(MVT::i64)) { + VT = TLI.getPointerTy(); + const Type *Ty = VT.getTypeForEVT(*DAG.getContext()); + if (DstAlign >= TLI.getTargetData()->getABITypeAlignment(Ty) || + TLI.allowsUnalignedMemoryAccesses(VT)) { VT = MVT::i64; } else { - switch (Align & 7) { + switch (DstAlign & 7) { case 0: VT = MVT::i64; break; case 4: VT = MVT::i32; break; case 2: VT = MVT::i16; break; @@ -3246,7 +3238,7 @@ bool MeetsMaxMemopRequirement(std::vector &MemOps, unsigned VTSize = VT.getSizeInBits() / 8; while (VTSize > Size) { // For now, only use non-vector load / store's for the left-over pieces. 
- if (VT.isVector()) { + if (VT.isVector() || VT.isFloatingPoint()) { VT = MVT::i64; while (!TLI.isTypeLegal(VT)) VT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy - 1); @@ -3269,11 +3261,11 @@ bool MeetsMaxMemopRequirement(std::vector &MemOps, } static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl, - SDValue Chain, SDValue Dst, - SDValue Src, uint64_t Size, - unsigned Align, bool AlwaysInline, - const Value *DstSV, uint64_t DstSVOff, - const Value *SrcSV, uint64_t SrcSVOff){ + SDValue Chain, SDValue Dst, + SDValue Src, uint64_t Size, + unsigned Align, bool AlwaysInline, + const Value *DstSV, uint64_t DstSVOff, + const Value *SrcSV, uint64_t SrcSVOff) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // Expand memcpy to a series of load and store ops if the size operand falls @@ -3282,15 +3274,33 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl, uint64_t Limit = -1ULL; if (!AlwaysInline) Limit = TLI.getMaxStoresPerMemcpy(); - unsigned DstAlign = Align; // Destination alignment can change. + bool DstAlignCanChange = false; + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + FrameIndexSDNode *FI = dyn_cast(Dst); + if (FI && !MFI->isFixedObjectIndex(FI->getIndex())) + DstAlignCanChange = true; + unsigned SrcAlign = DAG.InferPtrAlignment(Src); + if (Align > SrcAlign) + SrcAlign = Align; std::string Str; - bool CopyFromStr; - if (!MeetsMaxMemopRequirement(MemOps, Dst, Src, Limit, Size, DstAlign, - Str, CopyFromStr, DAG, TLI)) + bool CopyFromStr = isMemSrcFromString(Src, Str); + bool isZeroStr = CopyFromStr && Str.empty(); + if (!FindOptimalMemOpLowering(MemOps, Limit, Size, + (DstAlignCanChange ? 0 : Align), + (isZeroStr ? 
0 : SrcAlign), true, DAG, TLI)) return SDValue(); + if (DstAlignCanChange) { + const Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext()); + unsigned NewAlign = (unsigned) TLI.getTargetData()->getABITypeAlignment(Ty); + if (NewAlign > Align) { + // Give the stack frame object a larger alignment if needed. + if (MFI->getObjectAlignment(FI->getIndex()) < NewAlign) + MFI->setObjectAlignment(FI->getIndex(), NewAlign); + Align = NewAlign; + } + } - bool isZeroStr = CopyFromStr && Str.empty(); SmallVector OutChains; unsigned NumMemOps = MemOps.size(); uint64_t SrcOff = 0, DstOff = 0; @@ -3299,16 +3309,17 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl, unsigned VTSize = VT.getSizeInBits() / 8; SDValue Value, Store; - if (CopyFromStr && (isZeroStr || !VT.isVector())) { + if (CopyFromStr && + (isZeroStr || (VT.isInteger() && !VT.isVector()))) { // It's unlikely a store of a vector immediate can be done in a single // instruction. It would require a load from a constantpool first. - // We also handle store a vector with all zero's. + // We only handle zero vectors here. // FIXME: Handle other cases where store of vector immediate is done in // a single instruction. Value = getMemsetStringVal(VT, dl, DAG, TLI, Str, SrcOff); Store = DAG.getStore(Chain, dl, Value, getMemBasePlusOffset(Dst, DstOff, DAG), - DstSV, DstSVOff + DstOff, false, false, DstAlign); + DstSV, DstSVOff + DstOff, false, false, Align); } else { // The type might not be legal for the target. 
This should only happen // if the type is smaller than a legal type, as on PPC, so the right @@ -3319,11 +3330,12 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl, assert(NVT.bitsGE(VT)); Value = DAG.getExtLoad(ISD::EXTLOAD, dl, NVT, Chain, getMemBasePlusOffset(Src, SrcOff, DAG), - SrcSV, SrcSVOff + SrcOff, VT, false, false, Align); + SrcSV, SrcSVOff + SrcOff, VT, false, false, + MinAlign(SrcAlign, SrcOff)); Store = DAG.getTruncStore(Chain, dl, Value, getMemBasePlusOffset(Dst, DstOff, DAG), DstSV, DstSVOff + DstOff, VT, false, false, - DstAlign); + Align); } OutChains.push_back(Store); SrcOff += VTSize; @@ -3335,11 +3347,11 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl, } static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, DebugLoc dl, - SDValue Chain, SDValue Dst, - SDValue Src, uint64_t Size, - unsigned Align, bool AlwaysInline, - const Value *DstSV, uint64_t DstSVOff, - const Value *SrcSV, uint64_t SrcSVOff){ + SDValue Chain, SDValue Dst, + SDValue Src, uint64_t Size, + unsigned Align,bool AlwaysInline, + const Value *DstSV, uint64_t DstSVOff, + const Value *SrcSV, uint64_t SrcSVOff) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // Expand memmove to a series of load and store ops if the size operand falls @@ -3348,15 +3360,32 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, DebugLoc dl, uint64_t Limit = -1ULL; if (!AlwaysInline) Limit = TLI.getMaxStoresPerMemmove(); - unsigned DstAlign = Align; // Destination alignment can change. 
- std::string Str; - bool CopyFromStr; - if (!MeetsMaxMemopRequirement(MemOps, Dst, Src, Limit, Size, DstAlign, - Str, CopyFromStr, DAG, TLI)) + bool DstAlignCanChange = false; + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + FrameIndexSDNode *FI = dyn_cast(Dst); + if (FI && !MFI->isFixedObjectIndex(FI->getIndex())) + DstAlignCanChange = true; + unsigned SrcAlign = DAG.InferPtrAlignment(Src); + if (Align > SrcAlign) + SrcAlign = Align; + + if (!FindOptimalMemOpLowering(MemOps, Limit, Size, + (DstAlignCanChange ? 0 : Align), + SrcAlign, true, DAG, TLI)) return SDValue(); - uint64_t SrcOff = 0, DstOff = 0; + if (DstAlignCanChange) { + const Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext()); + unsigned NewAlign = (unsigned) TLI.getTargetData()->getABITypeAlignment(Ty); + if (NewAlign > Align) { + // Give the stack frame object a larger alignment if needed. + if (MFI->getObjectAlignment(FI->getIndex()) < NewAlign) + MFI->setObjectAlignment(FI->getIndex(), NewAlign); + Align = NewAlign; + } + } + uint64_t SrcOff = 0, DstOff = 0; SmallVector LoadValues; SmallVector LoadChains; SmallVector OutChains; @@ -3368,7 +3397,7 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, DebugLoc dl, Value = DAG.getLoad(VT, dl, Chain, getMemBasePlusOffset(Src, SrcOff, DAG), - SrcSV, SrcSVOff + SrcOff, false, false, Align); + SrcSV, SrcSVOff + SrcOff, false, false, SrcAlign); LoadValues.push_back(Value); LoadChains.push_back(Value.getValue(1)); SrcOff += VTSize; @@ -3383,7 +3412,7 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, DebugLoc dl, Store = DAG.getStore(Chain, dl, LoadValues[i], getMemBasePlusOffset(Dst, DstOff, DAG), - DstSV, DstSVOff + DstOff, false, false, DstAlign); + DstSV, DstSVOff + DstOff, false, false, Align); OutChains.push_back(Store); DstOff += VTSize; } @@ -3393,24 +3422,40 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, DebugLoc dl, } static SDValue getMemsetStores(SelectionDAG &DAG, DebugLoc dl, - SDValue 
Chain, SDValue Dst, - SDValue Src, uint64_t Size, - unsigned Align, - const Value *DstSV, uint64_t DstSVOff) { + SDValue Chain, SDValue Dst, + SDValue Src, uint64_t Size, + unsigned Align, + const Value *DstSV, uint64_t DstSVOff) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // Expand memset to a series of load/store ops if the size operand // falls below a certain threshold. std::vector MemOps; - std::string Str; - bool CopyFromStr; - if (!MeetsMaxMemopRequirement(MemOps, Dst, Src, TLI.getMaxStoresPerMemset(), - Size, Align, Str, CopyFromStr, DAG, TLI)) + bool DstAlignCanChange = false; + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + FrameIndexSDNode *FI = dyn_cast(Dst); + if (FI && !MFI->isFixedObjectIndex(FI->getIndex())) + DstAlignCanChange = true; + bool IsZero = isa(Src) && + cast(Src)->isNullValue(); + if (!FindOptimalMemOpLowering(MemOps, TLI.getMaxStoresPerMemset(), + Size, (DstAlignCanChange ? 0 : Align), 0, + IsZero, DAG, TLI)) return SDValue(); + if (DstAlignCanChange) { + const Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext()); + unsigned NewAlign = (unsigned) TLI.getTargetData()->getABITypeAlignment(Ty); + if (NewAlign > Align) { + // Give the stack frame object a larger alignment if needed. 
+ if (MFI->getObjectAlignment(FI->getIndex()) < NewAlign) + MFI->setObjectAlignment(FI->getIndex(), NewAlign); + Align = NewAlign; + } + } + SmallVector OutChains; uint64_t DstOff = 0; - unsigned NumMemOps = MemOps.size(); for (unsigned i = 0; i < NumMemOps; i++) { EVT VT = MemOps[i]; @@ -3441,10 +3486,9 @@ SDValue SelectionDAG::getMemcpy(SDValue Chain, DebugLoc dl, SDValue Dst, if (ConstantSize->isNullValue()) return Chain; - SDValue Result = - getMemcpyLoadsAndStores(*this, dl, Chain, Dst, Src, - ConstantSize->getZExtValue(), - Align, false, DstSV, DstSVOff, SrcSV, SrcSVOff); + SDValue Result = getMemcpyLoadsAndStores(*this, dl, Chain, Dst, Src, + ConstantSize->getZExtValue(),Align, + false, DstSV, DstSVOff, SrcSV, SrcSVOff); if (Result.getNode()) return Result; } @@ -4846,6 +4890,26 @@ SDNode *SelectionDAG::getNodeIfExists(unsigned Opcode, SDVTList VTList, return NULL; } +/// getDbgValue - Creates a SDDbgValue node. +/// +SDDbgValue * +SelectionDAG::getDbgValue(MDNode *MDPtr, SDNode *N, unsigned R, uint64_t Off, + DebugLoc DL, unsigned O) { + return new (Allocator) SDDbgValue(MDPtr, N, R, Off, DL, O); +} + +SDDbgValue * +SelectionDAG::getDbgValue(MDNode *MDPtr, Value *C, uint64_t Off, + DebugLoc DL, unsigned O) { + return new (Allocator) SDDbgValue(MDPtr, C, Off, DL, O); +} + +SDDbgValue * +SelectionDAG::getDbgValue(MDNode *MDPtr, unsigned FI, uint64_t Off, + DebugLoc DL, unsigned O) { + return new (Allocator) SDDbgValue(MDPtr, FI, Off, DL, O); +} + namespace { /// RAUWUpdateListener - Helper for ReplaceAllUsesWith - When the node @@ -5241,24 +5305,12 @@ unsigned SelectionDAG::GetOrdering(const SDNode *SD) const { return Ordering->getOrder(SD); } -/// AssignDbgInfo - Assign debug info to the SDNode. -void SelectionDAG::AssignDbgInfo(SDNode* SD, SDDbgValue* db) { - assert(SD && "Trying to assign dbg info to a null node!"); - DbgInfo->add(SD, db); - SD->setHasDebugValue(true); -} - -/// RememberDbgInfo - Remember debug info which is not assigned to an SDNode. 
-void SelectionDAG::RememberDbgInfo(SDDbgValue* db) { - DbgInfo->add(db); -} - -/// GetDbgInfo - Get the debug info, if any, for the SDNode. -SDDbgValue* SelectionDAG::GetDbgInfo(const SDNode *SD) { - assert(SD && "Trying to get the order of a null node!"); - if (SD->getHasDebugValue()) - return DbgInfo->getSDDbgValue(SD); - return 0; +/// AddDbgValue - Add a dbg_value SDNode. If SD is non-null that means the +/// value is produced by SD. +void SelectionDAG::AddDbgValue(SDDbgValue *DB, SDNode *SD) { + DbgInfo->add(DB, SD); + if (SD) + SD->setHasDebugValue(true); } //===----------------------------------------------------------------------===// @@ -6094,8 +6146,20 @@ unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const { // If this is a GlobalAddress + cst, return the alignment. GlobalValue *GV; int64_t GVOffset = 0; - if (TLI.isGAPlusOffset(Ptr.getNode(), GV, GVOffset)) - return MinAlign(GV->getAlignment(), GVOffset); + if (TLI.isGAPlusOffset(Ptr.getNode(), GV, GVOffset)) { + // If GV has specified alignment, then use it. Otherwise, use the preferred + // alignment. + unsigned Align = GV->getAlignment(); + if (!Align) { + if (GlobalVariable *GVar = dyn_cast(GV)) { + if (GVar->hasInitializer()) { + const TargetData *TD = TLI.getTargetData(); + Align = TD->getPreferredAlignment(GVar); + } + } + } + return MinAlign(Align, GVOffset); + } // If this is a direct reference to a stack slot, use information about the // stack slot's alignment. 
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 12096b9a68bb..922c6e8b02a9 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -3800,7 +3800,7 @@ SelectionDAGBuilder::visitIntrinsicCall(CallInst &I, unsigned Intrinsic) { int FI = SI->second; if (MachineModuleInfo *MMI = DAG.getMachineModuleInfo()) - if (MDNode *Dbg = DI.getMetadata("dbg")) + if (MDNode *Dbg = DI.getDbgMetadata()) MMI->setVariableDbgInfo(Variable, FI, Dbg); return 0; } @@ -3824,22 +3824,19 @@ SelectionDAGBuilder::visitIntrinsicCall(CallInst &I, unsigned Intrinsic) { // debug info exists. ++SDNodeOrder; if (isa(V) || isa(V)) { - SDDbgValue* dv = new SDDbgValue(Variable, V, Offset, dl, SDNodeOrder); - DAG.RememberDbgInfo(dv); + DAG.AddDbgValue(DAG.getDbgValue(Variable, V, Offset, dl, SDNodeOrder)); } else { SDValue &N = NodeMap[V]; - if (N.getNode()) { - SDDbgValue *dv = new SDDbgValue(Variable, N.getNode(), - N.getResNo(), Offset, dl, SDNodeOrder); - DAG.AssignDbgInfo(N.getNode(), dv); - } else { + if (N.getNode()) + DAG.AddDbgValue(DAG.getDbgValue(Variable, N.getNode(), + N.getResNo(), Offset, dl, SDNodeOrder), + N.getNode()); + else // We may expand this to cover more cases. One case where we have no // data available is an unreferenced parameter; we need this fallback. - SDDbgValue* dv = new SDDbgValue(Variable, + DAG.AddDbgValue(DAG.getDbgValue(Variable, UndefValue::get(V->getType()), - Offset, dl, SDNodeOrder); - DAG.RememberDbgInfo(dv); - } + Offset, dl, SDNodeOrder)); } // Build a debug info table entry. @@ -3855,7 +3852,7 @@ SelectionDAGBuilder::visitIntrinsicCall(CallInst &I, unsigned Intrinsic) { return 0; // VLAs. 
int FI = SI->second; if (MachineModuleInfo *MMI = DAG.getMachineModuleInfo()) - if (MDNode *Dbg = DI.getMetadata("dbg")) + if (MDNode *Dbg = DI.getDbgMetadata()) MMI->setVariableDbgInfo(Variable, FI, Dbg); return 0; } @@ -6054,8 +6051,10 @@ void SelectionDAGISel::LowerArguments(BasicBlock *LLVMBB) { } if (!I->use_empty()) { - SDValue Res = DAG.getMergeValues(&ArgValues[0], NumValues, - SDB->getCurDebugLoc()); + SDValue Res; + if (!ArgValues.empty()) + Res = DAG.getMergeValues(&ArgValues[0], NumValues, + SDB->getCurDebugLoc()); SDB->setValue(I, Res); // If this argument is live outside of the entry block, insert a copy from diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index cbbe4311756e..ea96b2179999 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -61,6 +61,7 @@ using namespace llvm; STATISTIC(NumFastIselFailures, "Number of instructions fast isel failed on"); +STATISTIC(NumDAGIselRetries,"Number of times dag isel has to try another path"); static cl::opt EnableFastISelVerbose("fast-isel-verbose", cl::Hidden, @@ -365,23 +366,23 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { /// SetDebugLoc - Update MF's and SDB's DebugLocs if debug information is /// attached with this instruction. 
-static void SetDebugLoc(unsigned MDDbgKind, Instruction *I, - SelectionDAGBuilder *SDB, +static void SetDebugLoc(Instruction *I, SelectionDAGBuilder *SDB, FastISel *FastIS, MachineFunction *MF) { - if (MDNode *Dbg = I->getMetadata(MDDbgKind)) { - DILocation DILoc(Dbg); - DebugLoc Loc = ExtractDebugLocation(DILoc, MF->getDebugLocInfo()); + MDNode *Dbg = I->getDbgMetadata(); + if (Dbg == 0) return; + + DILocation DILoc(Dbg); + DebugLoc Loc = ExtractDebugLocation(DILoc, MF->getDebugLocInfo()); - SDB->setCurDebugLoc(Loc); + SDB->setCurDebugLoc(Loc); - if (FastIS) - FastIS->setCurDebugLoc(Loc); + if (FastIS) + FastIS->setCurDebugLoc(Loc); - // If the function doesn't have a default debug location yet, set - // it. This is kind of a hack. - if (MF->getDefaultDebugLoc().isUnknown()) - MF->setDefaultDebugLoc(Loc); - } + // If the function doesn't have a default debug location yet, set + // it. This is kind of a hack. + if (MF->getDefaultDebugLoc().isUnknown()) + MF->setDefaultDebugLoc(Loc); } /// ResetDebugLoc - Set MF's and SDB's DebugLocs to Unknown. @@ -396,12 +397,11 @@ void SelectionDAGISel::SelectBasicBlock(BasicBlock *LLVMBB, BasicBlock::iterator End, bool &HadTailCall) { SDB->setCurrentBasicBlock(BB); - unsigned MDDbgKind = LLVMBB->getContext().getMDKindID("dbg"); // Lower all of the non-terminator instructions. If a call is emitted // as a tail call, cease emitting nodes for this block. for (BasicBlock::iterator I = Begin; I != End && !SDB->HasTailCall; ++I) { - SetDebugLoc(MDDbgKind, I, SDB, 0, MF); + SetDebugLoc(I, SDB, 0, MF); if (!isa(I)) { SDB->visit(*I); @@ -424,7 +424,7 @@ void SelectionDAGISel::SelectBasicBlock(BasicBlock *LLVMBB, HandlePHINodesInSuccessorBlocks(LLVMBB); // Lower the terminator after the copies are emitted. 
- SetDebugLoc(MDDbgKind, LLVMBB->getTerminator(), SDB, 0, MF); + SetDebugLoc(LLVMBB->getTerminator(), SDB, 0, MF); SDB->visit(*LLVMBB->getTerminator()); ResetDebugLoc(SDB, 0); } @@ -842,9 +842,6 @@ void SelectionDAGISel::DoInstructionSelection() { DEBUG(errs() << "===== Instruction selection ends:\n"); PostprocessISelDAG(); - - // FIXME: This shouldn't be needed, remove it. - CurDAG->RemoveDeadNodes(); } @@ -865,8 +862,6 @@ void SelectionDAGISel::SelectAllBasicBlocks(Function &Fn, #endif ); - unsigned MDDbgKind = Fn.getContext().getMDKindID("dbg"); - // Iterate over all basic blocks in the function. for (Function::iterator I = Fn.begin(), E = Fn.end(); I != E; ++I) { BasicBlock *LLVMBB = &*I; @@ -964,7 +959,7 @@ void SelectionDAGISel::SelectAllBasicBlocks(Function &Fn, break; } - SetDebugLoc(MDDbgKind, BI, SDB, FastIS, &MF); + SetDebugLoc(BI, SDB, FastIS, &MF); // Try to select the instruction with FastISel. if (FastIS->SelectInstruction(BI)) { @@ -1592,8 +1587,9 @@ UpdateChainsAndFlags(SDNode *NodeToMatch, SDValue InputChain, assert(ChainVal.getValueType() == MVT::Other && "Not a chain?"); CurDAG->ReplaceAllUsesOfValueWith(ChainVal, InputChain, &ISU); - // If the node became dead, delete it. - if (ChainNode->use_empty()) + // If the node became dead and we haven't already seen it, delete it. + if (ChainNode->use_empty() && + !std::count(NowDeadNodes.begin(), NowDeadNodes.end(), ChainNode)) NowDeadNodes.push_back(ChainNode); } } @@ -1614,8 +1610,9 @@ UpdateChainsAndFlags(SDNode *NodeToMatch, SDValue InputChain, CurDAG->ReplaceAllUsesOfValueWith(SDValue(FRN, FRN->getNumValues()-1), InputFlag, &ISU); - // If the node became dead, delete it. - if (FRN->use_empty()) + // If the node became dead and we haven't already seen it, delete it. 
+ if (FRN->use_empty() && + !std::count(NowDeadNodes.begin(), NowDeadNodes.end(), FRN)) NowDeadNodes.push_back(FRN); } } @@ -1810,9 +1807,9 @@ MorphNode(SDNode *Node, unsigned TargetOpc, SDVTList VTList, // It is possible we're using MorphNodeTo to replace a node with no // normal results with one that has a normal result (or we could be // adding a chain) and the input could have flags and chains as well. - // In this case we need to shifting the operands down. + // In this case we need to shift the operands down. // FIXME: This is a horrible hack and broken in obscure cases, no worse - // than the old isel though. We should sink this into MorphNodeTo. + // than the old isel though. int OldFlagResultNo = -1, OldChainResultNo = -1; unsigned NTMNumResults = Node->getNumValues(); @@ -1888,7 +1885,9 @@ CheckNodePredicate(const unsigned char *MatcherTable, unsigned &MatcherIndex, ALWAYS_INLINE static bool CheckOpcode(const unsigned char *MatcherTable, unsigned &MatcherIndex, SDNode *N) { - return N->getOpcode() == MatcherTable[MatcherIndex++]; + uint16_t Opc = MatcherTable[MatcherIndex++]; + Opc |= (unsigned short)MatcherTable[MatcherIndex++] << 8; + return N->getOpcode() == Opc; } ALWAYS_INLINE static bool @@ -2142,7 +2141,8 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable, if (CaseSize == 0) break; // Get the opcode, add the index to the table. - unsigned Opc = MatcherTable[Idx++]; + uint16_t Opc = MatcherTable[Idx++]; + Opc |= (unsigned short)MatcherTable[Idx++] << 8; if (Opc >= OpcodeOffset.size()) OpcodeOffset.resize((Opc+1)*2); OpcodeOffset[Opc] = Idx; @@ -2181,6 +2181,9 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable, FailIndex = MatcherIndex+NumToSkip; + unsigned MatcherIndexOfPredicate = MatcherIndex; + (void)MatcherIndexOfPredicate; // silence warning. + // If we can't evaluate this predicate without pushing a scope (e.g. 
if // it is a 'MoveParent') or if the predicate succeeds on this node, we // push the scope and evaluate the full predicate chain. @@ -2190,9 +2193,10 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable, if (!Result) break; - DEBUG(errs() << " Skipped scope entry at index " << MatcherIndex - << " continuing at " << FailIndex << "\n"); - + DEBUG(errs() << " Skipped scope entry (due to false predicate) at " + << "index " << MatcherIndexOfPredicate + << ", continuing at " << FailIndex << "\n"); + ++NumDAGIselRetries; // Otherwise, we know that this case of the Scope is guaranteed to fail, // move to the next case. @@ -2298,8 +2302,11 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable, CaseSize = GetVBR(CaseSize, MatcherTable, MatcherIndex); if (CaseSize == 0) break; + uint16_t Opc = MatcherTable[MatcherIndex++]; + Opc |= (unsigned short)MatcherTable[MatcherIndex++] << 8; + // If the opcode matches, then we will execute this case. - if (CurNodeOpcode == MatcherTable[MatcherIndex++]) + if (CurNodeOpcode == Opc) break; // Otherwise, skip over this case. @@ -2428,6 +2435,35 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable, continue; } + case OPC_EmitMergeInputChains1_0: // OPC_EmitMergeInputChains, 1, 0 + case OPC_EmitMergeInputChains1_1: { // OPC_EmitMergeInputChains, 1, 1 + // These are space-optimized forms of OPC_EmitMergeInputChains. + assert(InputChain.getNode() == 0 && + "EmitMergeInputChains should be the first chain producing node"); + assert(ChainNodesMatched.empty() && + "Should only have one EmitMergeInputChains per match"); + + // Read all of the chained nodes. + unsigned RecNo = Opcode == OPC_EmitMergeInputChains1_1; + assert(RecNo < RecordedNodes.size() && "Invalid CheckSame"); + ChainNodesMatched.push_back(RecordedNodes[RecNo].getNode()); + + // FIXME: What if other value results of the node have uses not matched + // by this pattern? 
+ if (ChainNodesMatched.back() != NodeToMatch && + !RecordedNodes[RecNo].hasOneUse()) { + ChainNodesMatched.clear(); + break; + } + + // Merge the input chains if they are not intra-pattern references. + InputChain = HandleMergeInputChains(ChainNodesMatched, CurDAG); + + if (InputChain.getNode() == 0) + break; // Failed to merge. + continue; + } + case OPC_EmitMergeInputChains: { assert(InputChain.getNode() == 0 && "EmitMergeInputChains should be the first chain producing node"); @@ -2646,14 +2682,10 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable, assert(ResSlot < RecordedNodes.size() && "Invalid CheckSame"); SDValue Res = RecordedNodes[ResSlot]; - // FIXME2: Eliminate this horrible hack by fixing the 'Gen' program - // after (parallel) on input patterns are removed. This would also - // allow us to stop encoding #results in OPC_CompleteMatch's table - // entry. - if (NodeToMatch->getNumValues() <= i || - NodeToMatch->getValueType(i) == MVT::Other || - NodeToMatch->getValueType(i) == MVT::Flag) - break; + assert(i < NodeToMatch->getNumValues() && + NodeToMatch->getValueType(i) != MVT::Other && + NodeToMatch->getValueType(i) != MVT::Flag && + "Invalid number of results to complete!"); assert((NodeToMatch->getValueType(i) == Res.getValueType() || NodeToMatch->getValueType(i) == MVT::iPTR || Res.getValueType() == MVT::iPTR || @@ -2685,6 +2717,7 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable, // another child to try in the current 'Scope', otherwise pop it until we // find a case to check. 
DEBUG(errs() << " Match failed at index " << CurrentOpcodeIndex << "\n"); + ++NumDAGIselRetries; while (1) { if (MatchScopes.empty()) { CannotYetSelect(NodeToMatch); diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp index f7ef2d640b44..ea2ff2f36891 100644 --- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -186,11 +186,13 @@ static void InitLibcallNames(const char **Names) { Names[RTLIB::FPROUND_PPCF128_F32] = "__trunctfsf2"; Names[RTLIB::FPROUND_F80_F64] = "__truncxfdf2"; Names[RTLIB::FPROUND_PPCF128_F64] = "__trunctfdf2"; - Names[RTLIB::FPTOSINT_F32_I8] = "__fixsfi8"; - Names[RTLIB::FPTOSINT_F32_I16] = "__fixsfi16"; + Names[RTLIB::FPTOSINT_F32_I8] = "__fixsfqi"; + Names[RTLIB::FPTOSINT_F32_I16] = "__fixsfhi"; Names[RTLIB::FPTOSINT_F32_I32] = "__fixsfsi"; Names[RTLIB::FPTOSINT_F32_I64] = "__fixsfdi"; Names[RTLIB::FPTOSINT_F32_I128] = "__fixsfti"; + Names[RTLIB::FPTOSINT_F64_I8] = "__fixdfqi"; + Names[RTLIB::FPTOSINT_F64_I16] = "__fixdfhi"; Names[RTLIB::FPTOSINT_F64_I32] = "__fixdfsi"; Names[RTLIB::FPTOSINT_F64_I64] = "__fixdfdi"; Names[RTLIB::FPTOSINT_F64_I128] = "__fixdfti"; @@ -200,11 +202,13 @@ static void InitLibcallNames(const char **Names) { Names[RTLIB::FPTOSINT_PPCF128_I32] = "__fixtfsi"; Names[RTLIB::FPTOSINT_PPCF128_I64] = "__fixtfdi"; Names[RTLIB::FPTOSINT_PPCF128_I128] = "__fixtfti"; - Names[RTLIB::FPTOUINT_F32_I8] = "__fixunssfi8"; - Names[RTLIB::FPTOUINT_F32_I16] = "__fixunssfi16"; + Names[RTLIB::FPTOUINT_F32_I8] = "__fixunssfqi"; + Names[RTLIB::FPTOUINT_F32_I16] = "__fixunssfhi"; Names[RTLIB::FPTOUINT_F32_I32] = "__fixunssfsi"; Names[RTLIB::FPTOUINT_F32_I64] = "__fixunssfdi"; Names[RTLIB::FPTOUINT_F32_I128] = "__fixunssfti"; + Names[RTLIB::FPTOUINT_F64_I8] = "__fixunsdfqi"; + Names[RTLIB::FPTOUINT_F64_I16] = "__fixunsdfhi"; Names[RTLIB::FPTOUINT_F64_I32] = "__fixunsdfsi"; Names[RTLIB::FPTOUINT_F64_I64] = "__fixunsdfdi"; 
Names[RTLIB::FPTOUINT_F64_I128] = "__fixunsdfti"; @@ -314,6 +318,10 @@ RTLIB::Libcall RTLIB::getFPTOSINT(EVT OpVT, EVT RetVT) { if (RetVT == MVT::i128) return FPTOSINT_F32_I128; } else if (OpVT == MVT::f64) { + if (RetVT == MVT::i8) + return FPTOSINT_F64_I8; + if (RetVT == MVT::i16) + return FPTOSINT_F64_I16; if (RetVT == MVT::i32) return FPTOSINT_F64_I32; if (RetVT == MVT::i64) @@ -353,6 +361,10 @@ RTLIB::Libcall RTLIB::getFPTOUINT(EVT OpVT, EVT RetVT) { if (RetVT == MVT::i128) return FPTOUINT_F32_I128; } else if (OpVT == MVT::f64) { + if (RetVT == MVT::i8) + return FPTOUINT_F64_I8; + if (RetVT == MVT::i16) + return FPTOUINT_F64_I16; if (RetVT == MVT::i32) return FPTOUINT_F64_I32; if (RetVT == MVT::i64) @@ -475,7 +487,6 @@ TargetLowering::TargetLowering(TargetMachine &tm,TargetLoweringObjectFile *tlof) memset(LoadExtActions, 0, sizeof(LoadExtActions)); memset(TruncStoreActions, 0, sizeof(TruncStoreActions)); memset(IndexedModeActions, 0, sizeof(IndexedModeActions)); - memset(ConvertActions, 0, sizeof(ConvertActions)); memset(CondCodeActions, 0, sizeof(CondCodeActions)); // Set default actions for various operations. diff --git a/lib/CodeGen/SimpleRegisterCoalescing.cpp b/lib/CodeGen/SimpleRegisterCoalescing.cpp index 97e858f09c82..15ca3740b8f2 100644 --- a/lib/CodeGen/SimpleRegisterCoalescing.cpp +++ b/lib/CodeGen/SimpleRegisterCoalescing.cpp @@ -59,11 +59,6 @@ DisableCrossClassJoin("disable-cross-class-join", cl::desc("Avoid coalescing cross register class copies"), cl::init(false), cl::Hidden); -static cl::opt -PhysJoinTweak("tweak-phys-join-heuristics", - cl::desc("Tweak heuristics for joining phys reg with vr"), - cl::init(false), cl::Hidden); - static RegisterPass X("simple-register-coalescing", "Simple Register Coalescing"); @@ -1445,7 +1440,6 @@ bool SimpleRegisterCoalescing::JoinCopy(CopyRec &TheCopy, bool &Again) { const TargetRegisterClass *SrcRC= SrcIsPhys ? 0 : mri_->getRegClass(SrcReg); const TargetRegisterClass *DstRC= DstIsPhys ? 
0 : mri_->getRegClass(DstReg); const TargetRegisterClass *NewRC = NULL; - MachineBasicBlock *CopyMBB = CopyMI->getParent(); unsigned RealDstReg = 0; unsigned RealSrcReg = 0; if (isExtSubReg || isInsSubReg || isSubRegToReg) { @@ -1646,8 +1640,8 @@ bool SimpleRegisterCoalescing::JoinCopy(CopyRec &TheCopy, bool &Again) { else if (RealSrcReg) SavedLI.reset(li_->dupInterval(&DstInt)); - // Check if it is necessary to propagate "isDead" property. if (!isExtSubReg && !isInsSubReg && !isSubRegToReg) { + // Check if it is necessary to propagate "isDead" property. MachineOperand *mopd = CopyMI->findRegisterDefOperand(DstReg, false); bool isDead = mopd->isDead(); @@ -1656,60 +1650,42 @@ bool SimpleRegisterCoalescing::JoinCopy(CopyRec &TheCopy, bool &Again) { // these are not spillable! If the destination interval uses are far away, // think twice about coalescing them! if (!isDead && (SrcIsPhys || DstIsPhys)) { - // If the copy is in a loop, take care not to coalesce aggressively if the - // src is coming in from outside the loop (or the dst is out of the loop). - // If it's not in a loop, then determine whether to join them base purely - // by the length of the interval. - if (PhysJoinTweak) { - if (SrcIsPhys) { - if (!isWinToJoinVRWithSrcPhysReg(CopyMI, CopyMBB, DstInt, SrcInt)) { - mri_->setRegAllocationHint(DstInt.reg, 0, SrcReg); - ++numAborts; - DEBUG(dbgs() << "\tMay tie down a physical register, abort!\n"); - Again = true; // May be possible to coalesce later. - return false; - } - } else { - if (!isWinToJoinVRWithDstPhysReg(CopyMI, CopyMBB, DstInt, SrcInt)) { - mri_->setRegAllocationHint(SrcInt.reg, 0, DstReg); - ++numAborts; - DEBUG(dbgs() << "\tMay tie down a physical register, abort!\n"); - Again = true; // May be possible to coalesce later. - return false; - } - } - } else { - // If the virtual register live interval is long but it has low use - // density, do not join them, instead mark the physical register as its - // allocation preference. 
- LiveInterval &JoinVInt = SrcIsPhys ? DstInt : SrcInt; - LiveInterval &JoinPInt = SrcIsPhys ? SrcInt : DstInt; - unsigned JoinVReg = SrcIsPhys ? DstReg : SrcReg; - unsigned JoinPReg = SrcIsPhys ? SrcReg : DstReg; + // If the virtual register live interval is long but it has low use + // density, do not join them, instead mark the physical register as its + // allocation preference. + LiveInterval &JoinVInt = SrcIsPhys ? DstInt : SrcInt; + LiveInterval &JoinPInt = SrcIsPhys ? SrcInt : DstInt; + unsigned JoinVReg = SrcIsPhys ? DstReg : SrcReg; + unsigned JoinPReg = SrcIsPhys ? SrcReg : DstReg; - // Don't join with physregs that have a ridiculous number of live - // ranges. The data structure performance is really bad when that - // happens. - if (JoinPInt.ranges.size() > 1000) { - mri_->setRegAllocationHint(JoinVInt.reg, 0, JoinPReg); - ++numAborts; - DEBUG(dbgs() << "\tPhysical register too complicated, abort!\n"); - return false; - } + // Don't join with physregs that have a ridiculous number of live + // ranges. The data structure performance is really bad when that + // happens. + if (JoinPInt.ranges.size() > 1000) { + mri_->setRegAllocationHint(JoinVInt.reg, 0, JoinPReg); + ++numAborts; + DEBUG(dbgs() + << "\tPhysical register live interval too complicated, abort!\n"); + return false; + } - const TargetRegisterClass *RC = mri_->getRegClass(JoinVReg); - unsigned Threshold = allocatableRCRegs_[RC].count() * 2; - unsigned Length = li_->getApproximateInstructionCount(JoinVInt); - float Ratio = 1.0 / Threshold; - if (Length > Threshold && - (((float)std::distance(mri_->use_nodbg_begin(JoinVReg), - mri_->use_nodbg_end()) / Length) < Ratio)) { - mri_->setRegAllocationHint(JoinVInt.reg, 0, JoinPReg); - ++numAborts; - DEBUG(dbgs() << "\tMay tie down a physical register, abort!\n"); - Again = true; // May be possible to coalesce later. 
- return false; - } + const TargetRegisterClass *RC = mri_->getRegClass(JoinVReg); + unsigned Threshold = allocatableRCRegs_[RC].count() * 2; + unsigned Length = li_->getApproximateInstructionCount(JoinVInt); + float Ratio = 1.0 / Threshold; + if (Length > Threshold && + (((float)std::distance(mri_->use_nodbg_begin(JoinVReg), + mri_->use_nodbg_end()) / Length) < Ratio)) { + // Before giving up coalescing, if definition of source is defined by + // trivial computation, try rematerializing it. + if (ReMaterializeTrivialDef(SrcInt, DstReg, DstSubIdx, CopyMI)) + return true; + + mri_->setRegAllocationHint(JoinVInt.reg, 0, JoinPReg); + ++numAborts; + DEBUG(dbgs() << "\tMay tie down a physical register, abort!\n"); + Again = true; // May be possible to coalesce later. + return false; } } } @@ -1720,16 +1696,15 @@ bool SimpleRegisterCoalescing::JoinCopy(CopyRec &TheCopy, bool &Again) { // been modified, so we can use this information below to update aliases. bool Swapped = false; // If SrcInt is implicitly defined, it's safe to coalesce. - bool isEmpty = SrcInt.empty(); - if (isEmpty && !CanCoalesceWithImpDef(CopyMI, DstInt, SrcInt)) { - // Only coalesce an empty interval (defined by implicit_def) with - // another interval which has a valno defined by the CopyMI and the CopyMI - // is a kill of the implicit def. - DEBUG(dbgs() << "Not profitable!\n"); - return false; - } - - if (!isEmpty && !JoinIntervals(DstInt, SrcInt, Swapped)) { + if (SrcInt.empty()) { + if (!CanCoalesceWithImpDef(CopyMI, DstInt, SrcInt)) { + // Only coalesce an empty interval (defined by implicit_def) with + // another interval which has a valno defined by the CopyMI and the CopyMI + // is a kill of the implicit def. + DEBUG(dbgs() << "Not profitable!\n"); + return false; + } + } else if (!JoinIntervals(DstInt, SrcInt, Swapped)) { // Coalescing failed. 
// If definition of source is defined by trivial computation, try @@ -2800,7 +2775,7 @@ bool SimpleRegisterCoalescing::runOnMachineFunction(MachineFunction &fn) { if (MO.isDead()) continue; if (TargetRegisterInfo::isPhysicalRegister(Reg) || - !mri_->use_empty(Reg)) { + !mri_->use_nodbg_empty(Reg)) { isDead = false; break; } diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index b62cca3073d2..d6bdb1040f3e 100644 --- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -406,7 +406,7 @@ getExprForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang, // Add information about the stub reference to ELFMMI so that the stub // gets emitted by the asmprinter. - MCSymbol *SSym = getContext().GetOrCreateTemporarySymbol(Name.str()); + MCSymbol *SSym = getContext().GetOrCreateSymbol(Name.str()); MachineModuleInfoImpl::StubValueTy &StubSym = ELFMMI.getGVStubEntry(SSym); if (StubSym.getPointer() == 0) { MCSymbol *Sym = Mang->getSymbol(GV); @@ -759,7 +759,7 @@ getExprForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang, // Add information about the stub reference to MachOMMI so that the stub // gets emitted by the asmprinter. - MCSymbol *SSym = getContext().GetOrCreateTemporarySymbol(Name.str()); + MCSymbol *SSym = getContext().GetOrCreateSymbol(Name.str()); MachineModuleInfoImpl::StubValueTy &StubSym = MachOMMI.getGVStubEntry(SSym); if (StubSym.getPointer() == 0) { MCSymbol *Sym = Mang->getSymbol(GV); diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp index c840b3968cd3..c288ae0816d0 100644 --- a/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -188,8 +188,9 @@ bool TwoAddressInstructionPass::Sink3AddrInstruction(MachineBasicBlock *MBB, // Find the instruction that kills SavedReg. 
MachineInstr *KillMI = NULL; - for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(SavedReg), - UE = MRI->use_end(); UI != UE; ++UI) { + for (MachineRegisterInfo::use_nodbg_iterator + UI = MRI->use_nodbg_begin(SavedReg), + UE = MRI->use_nodbg_end(); UI != UE; ++UI) { MachineOperand &UseMO = UI.getOperand(); if (!UseMO.isKill()) continue; @@ -280,8 +281,8 @@ TwoAddressInstructionPass::isProfitableToReMat(unsigned Reg, MachineInstr *MI, MachineInstr *DefMI, MachineBasicBlock *MBB, unsigned Loc) { bool OtherUse = false; - for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(Reg), - UE = MRI->use_end(); UI != UE; ++UI) { + for (MachineRegisterInfo::use_nodbg_iterator UI = MRI->use_nodbg_begin(Reg), + UE = MRI->use_nodbg_end(); UI != UE; ++UI) { MachineOperand &UseMO = UI.getOperand(); MachineInstr *UseMI = UseMO.getParent(); MachineBasicBlock *UseMBB = UseMI->getParent(); @@ -927,6 +928,7 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) { mi = nmi; continue; } + const TargetInstrDesc &TID = mi->getDesc(); bool FirstTied = true; @@ -1101,7 +1103,7 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) { // Some remat'ed instructions are dead. int VReg = ReMatRegs.find_first(); while (VReg != -1) { - if (MRI->use_empty(VReg)) { + if (MRI->use_nodbg_empty(VReg)) { MachineInstr *DefMI = MRI->getVRegDef(VReg); DefMI->eraseFromParent(); } diff --git a/lib/CodeGen/VirtRegRewriter.cpp b/lib/CodeGen/VirtRegRewriter.cpp index 44d5311afd44..0b7fde7ec8f6 100644 --- a/lib/CodeGen/VirtRegRewriter.cpp +++ b/lib/CodeGen/VirtRegRewriter.cpp @@ -572,6 +572,9 @@ static bool InvalidateRegDef(MachineBasicBlock::iterator I, static void UpdateKills(MachineInstr &MI, const TargetRegisterInfo* TRI, BitVector &RegKills, std::vector &KillOps) { + // These do not affect kill info at all. 
+ if (MI.isDebugValue()) + return; for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { MachineOperand &MO = MI.getOperand(i); if (!MO.isReg() || !MO.isUse() || MO.isUndef()) @@ -987,10 +990,17 @@ static unsigned FindFreeRegister(MachineBasicBlock::iterator MII, SmallVector Kills; // Take a look at 2 instructions at most. - for (unsigned Count = 0; Count < 2; ++Count) { + unsigned Count = 0; + while (Count < 2) { if (MII == MBB.begin()) break; MachineInstr *PrevMI = prior(MII); + MII = PrevMI; + + if (PrevMI->isDebugValue()) + continue; // Skip over dbg_value instructions. + ++Count; + for (unsigned i = 0, e = PrevMI->getNumOperands(); i != e; ++i) { MachineOperand &MO = PrevMI->getOperand(i); if (!MO.isReg() || MO.getReg() == 0) @@ -1019,8 +1029,6 @@ static unsigned FindFreeRegister(MachineBasicBlock::iterator MII, for (const unsigned *AS = TRI->getAliasSet(Reg); *AS; ++AS) Uses.set(*AS); } - - MII = PrevMI; } return 0; @@ -1210,6 +1218,9 @@ OptimizeByUnfold2(unsigned VirtReg, int SS, std::vector &KillOps) { MachineBasicBlock::iterator NextMII = llvm::next(MII); + // Skip over dbg_value instructions. + while (NextMII != MBB->end() && NextMII->isDebugValue()) + NextMII = llvm::next(NextMII); if (NextMII == MBB->end()) return false; @@ -1274,6 +1285,9 @@ OptimizeByUnfold2(unsigned VirtReg, int SS, VRM->RemoveMachineInstrFromMaps(&NextMI); MBB->erase(&NextMI); ++NumModRefUnfold; + // Skip over dbg_value instructions. 
+ while (NextMII != MBB->end() && NextMII->isDebugValue()) + NextMII = llvm::next(NextMII); if (NextMII == MBB->end()) break; } while (FoldsStackSlotModRef(*NextMII, SS, PhysReg, TII, TRI, *VRM)); @@ -1619,7 +1633,7 @@ TransferDeadness(unsigned Reg, BitVector &RegKills, for (MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(Reg), RE = MRI->reg_end(); RI != RE; ++RI) { MachineInstr *UDMI = &*RI; - if (UDMI->getParent() != MBB) + if (UDMI->isDebugValue() || UDMI->getParent() != MBB) continue; DenseMap::iterator DI = DistanceMap.find(UDMI); if (DI == DistanceMap.end()) diff --git a/lib/ExecutionEngine/ExecutionEngine.cpp b/lib/ExecutionEngine/ExecutionEngine.cpp index b2e2a04084c4..da21c2de9aa4 100644 --- a/lib/ExecutionEngine/ExecutionEngine.cpp +++ b/lib/ExecutionEngine/ExecutionEngine.cpp @@ -66,10 +66,39 @@ ExecutionEngine::~ExecutionEngine() { delete Modules[i]; } +namespace { +// This class automatically deletes the memory block when the GlobalVariable is +// destroyed. +class GVMemoryBlock : public CallbackVH { + GVMemoryBlock(const GlobalVariable *GV) + : CallbackVH(const_cast(GV)) {} + +public: + // Returns the address the GlobalVariable should be written into. The + // GVMemoryBlock object prefixes that. + static char *Create(const GlobalVariable *GV, const TargetData& TD) { + const Type *ElTy = GV->getType()->getElementType(); + size_t GVSize = (size_t)TD.getTypeAllocSize(ElTy); + void *RawMemory = ::operator new( + TargetData::RoundUpAlignment(sizeof(GVMemoryBlock), + TD.getPreferredAlignment(GV)) + + GVSize); + new(RawMemory) GVMemoryBlock(GV); + return static_cast(RawMemory) + sizeof(GVMemoryBlock); + } + + virtual void deleted() { + // We allocated with operator new and with some extra memory hanging off the + // end, so don't just delete this. I'm not sure if this is actually + // required. 
+ this->~GVMemoryBlock(); + ::operator delete(this); + } +}; +} // anonymous namespace + char* ExecutionEngine::getMemoryForGV(const GlobalVariable* GV) { - const Type *ElTy = GV->getType()->getElementType(); - size_t GVSize = (size_t)getTargetData()->getTypeAllocSize(ElTy); - return new char[GVSize]; + return GVMemoryBlock::Create(GV, *getTargetData()); } /// removeModule - Remove a Module from the list of modules. @@ -221,35 +250,55 @@ const GlobalValue *ExecutionEngine::getGlobalValueAtAddress(void *Addr) { return I != EEState.getGlobalAddressReverseMap(locked).end() ? I->second : 0; } -// CreateArgv - Turn a vector of strings into a nice argv style array of -// pointers to null terminated strings. -// -static void *CreateArgv(LLVMContext &C, ExecutionEngine *EE, - const std::vector &InputArgv) { +namespace { +class ArgvArray { + char *Array; + std::vector Values; +public: + ArgvArray() : Array(NULL) {} + ~ArgvArray() { clear(); } + void clear() { + delete[] Array; + Array = NULL; + for (size_t I = 0, E = Values.size(); I != E; ++I) { + delete[] Values[I]; + } + Values.clear(); + } + /// Turn a vector of strings into a nice argv style array of pointers to null + /// terminated strings. + void *reset(LLVMContext &C, ExecutionEngine *EE, + const std::vector &InputArgv); +}; +} // anonymous namespace +void *ArgvArray::reset(LLVMContext &C, ExecutionEngine *EE, + const std::vector &InputArgv) { + clear(); // Free the old contents. 
unsigned PtrSize = EE->getTargetData()->getPointerSize(); - char *Result = new char[(InputArgv.size()+1)*PtrSize]; + Array = new char[(InputArgv.size()+1)*PtrSize]; - DEBUG(dbgs() << "JIT: ARGV = " << (void*)Result << "\n"); + DEBUG(dbgs() << "JIT: ARGV = " << (void*)Array << "\n"); const Type *SBytePtr = Type::getInt8PtrTy(C); for (unsigned i = 0; i != InputArgv.size(); ++i) { unsigned Size = InputArgv[i].size()+1; char *Dest = new char[Size]; + Values.push_back(Dest); DEBUG(dbgs() << "JIT: ARGV[" << i << "] = " << (void*)Dest << "\n"); std::copy(InputArgv[i].begin(), InputArgv[i].end(), Dest); Dest[Size-1] = 0; - // Endian safe: Result[i] = (PointerTy)Dest; - EE->StoreValueToMemory(PTOGV(Dest), (GenericValue*)(Result+i*PtrSize), + // Endian safe: Array[i] = (PointerTy)Dest; + EE->StoreValueToMemory(PTOGV(Dest), (GenericValue*)(Array+i*PtrSize), SBytePtr); } // Null terminate it EE->StoreValueToMemory(PTOGV(0), - (GenericValue*)(Result+InputArgv.size()*PtrSize), + (GenericValue*)(Array+InputArgv.size()*PtrSize), SBytePtr); - return Result; + return Array; } @@ -353,11 +402,13 @@ int ExecutionEngine::runFunctionAsMain(Function *Fn, llvm_report_error("Invalid number of arguments of main() supplied"); } + ArgvArray CArgv; + ArgvArray CEnv; if (NumArgs) { GVArgs.push_back(GVArgc); // Arg #0 = argc. if (NumArgs > 1) { // Arg #1 = argv. - GVArgs.push_back(PTOGV(CreateArgv(Fn->getContext(), this, argv))); + GVArgs.push_back(PTOGV(CArgv.reset(Fn->getContext(), this, argv))); assert(!isTargetNullPtr(this, GVTOP(GVArgs[1])) && "argv[0] was null after CreateArgv"); if (NumArgs > 2) { @@ -365,7 +416,7 @@ int ExecutionEngine::runFunctionAsMain(Function *Fn, for (unsigned i = 0; envp[i]; ++i) EnvVars.push_back(envp[i]); // Arg #2 = envp. 
- GVArgs.push_back(PTOGV(CreateArgv(Fn->getContext(), this, EnvVars))); + GVArgs.push_back(PTOGV(CEnv.reset(Fn->getContext(), this, EnvVars))); } } } diff --git a/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp b/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp index 7b061d3ac346..3ba783bb27a7 100644 --- a/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp +++ b/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp @@ -265,6 +265,8 @@ GenericValue Interpreter::callExternalFunction(Function *F, if (RF == RawFunctions->end()) { RawFn = (RawFunc)(intptr_t) sys::DynamicLibrary::SearchForAddressOfSymbol(F->getName()); + if (!RawFn) + RawFn = (RawFunc)(intptr_t)getPointerToGlobalIfAvailable(F); if (RawFn != 0) RawFunctions->insert(std::make_pair(F, RawFn)); // Cache for later } else { diff --git a/lib/MC/MCAsmStreamer.cpp b/lib/MC/MCAsmStreamer.cpp index 2025463a80be..7a23aecf2970 100644 --- a/lib/MC/MCAsmStreamer.cpp +++ b/lib/MC/MCAsmStreamer.cpp @@ -623,24 +623,10 @@ void MCAsmStreamer::EmitInstruction(const MCInst &Inst) { AddEncodingComment(Inst); // Show the MCInst if enabled. - if (ShowInst) { - raw_ostream &OS = GetCommentOS(); - OS << "getOpcodeName(Inst.getOpcode()); - if (!InstName.empty()) - OS << ' ' << InstName; - - for (unsigned i = 0, e = Inst.getNumOperands(); i != e; ++i) { - OS << "\n "; - Inst.getOperand(i).print(OS, &MAI); - } - OS << ">\n"; - } + if (ShowInst) + Inst.dump_pretty(GetCommentOS(), &MAI, InstPrinter.get(), "\n "); - // If we have an AsmPrinter, use that to print, otherwise dump the MCInst. + // If we have an AsmPrinter, use that to print, otherwise print the MCInst. 
if (InstPrinter) InstPrinter->printInst(&Inst); else diff --git a/lib/MC/MCAssembler.cpp b/lib/MC/MCAssembler.cpp index beecf7e6b858..03b8bd340b4e 100644 --- a/lib/MC/MCAssembler.cpp +++ b/lib/MC/MCAssembler.cpp @@ -19,19 +19,26 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Twine.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Support/Debug.h" #include "llvm/Target/TargetRegistry.h" #include "llvm/Target/TargetAsmBackend.h" -// FIXME: Gross. -#include "../Target/X86/X86FixupKinds.h" - #include using namespace llvm; +namespace { +namespace stats { STATISTIC(EmittedFragments, "Number of emitted assembler fragments"); +STATISTIC(EvaluateFixup, "Number of evaluated fixups"); +STATISTIC(FragmentLayouts, "Number of fragment layouts"); +STATISTIC(ObjectBytes, "Number of emitted object file bytes"); +STATISTIC(RelaxationSteps, "Number of assembler layout and relaxation steps"); +STATISTIC(RelaxedInstructions, "Number of relaxed instructions"); +STATISTIC(SectionLayouts, "Number of section layouts"); +} +} // FIXME FIXME FIXME: There are number of places in this file where we convert // what is a 64-bit assembler value used for computation into a value in the @@ -40,13 +47,103 @@ STATISTIC(EmittedFragments, "Number of emitted assembler fragments"); /* *** */ +void MCAsmLayout::UpdateForSlide(MCFragment *F, int SlideAmount) { + // We shouldn't have to do anything special to support negative slides, and it + // is a perfectly valid thing to do as long as other parts of the system are + // can guarantee convergence. + assert(SlideAmount >= 0 && "Negative slides not yet supported"); + + // Update the layout by simply recomputing the layout for the entire + // file. This is trivially correct, but very slow. + // + // FIXME-PERF: This is O(N^2), but will be eliminated once we get smarter. + + // Layout the concrete sections and fragments. 
+ MCAssembler &Asm = getAssembler(); + uint64_t Address = 0; + for (MCAssembler::iterator it = Asm.begin(), ie = Asm.end(); it != ie; ++it) { + // Skip virtual sections. + if (Asm.getBackend().isVirtualSection(it->getSection())) + continue; + + // Layout the section fragments and its size. + Address = Asm.LayoutSection(*it, *this, Address); + } + + // Layout the virtual sections. + for (MCAssembler::iterator it = Asm.begin(), ie = Asm.end(); it != ie; ++it) { + if (!Asm.getBackend().isVirtualSection(it->getSection())) + continue; + + // Layout the section fragments and its size. + Address = Asm.LayoutSection(*it, *this, Address); + } +} + +uint64_t MCAsmLayout::getFragmentAddress(const MCFragment *F) const { + assert(F->getParent() && "Missing section()!"); + return getSectionAddress(F->getParent()) + getFragmentOffset(F); +} + +uint64_t MCAsmLayout::getFragmentEffectiveSize(const MCFragment *F) const { + assert(F->EffectiveSize != ~UINT64_C(0) && "Address not set!"); + return F->EffectiveSize; +} + +void MCAsmLayout::setFragmentEffectiveSize(MCFragment *F, uint64_t Value) { + F->EffectiveSize = Value; +} + +uint64_t MCAsmLayout::getFragmentOffset(const MCFragment *F) const { + assert(F->Offset != ~UINT64_C(0) && "Address not set!"); + return F->Offset; +} + +void MCAsmLayout::setFragmentOffset(MCFragment *F, uint64_t Value) { + F->Offset = Value; +} + +uint64_t MCAsmLayout::getSymbolAddress(const MCSymbolData *SD) const { + assert(SD->getFragment() && "Invalid getAddress() on undefined symbol!"); + return getFragmentAddress(SD->getFragment()) + SD->getOffset(); +} + +uint64_t MCAsmLayout::getSectionAddress(const MCSectionData *SD) const { + assert(SD->Address != ~UINT64_C(0) && "Address not set!"); + return SD->Address; +} + +void MCAsmLayout::setSectionAddress(MCSectionData *SD, uint64_t Value) { + SD->Address = Value; +} + +uint64_t MCAsmLayout::getSectionSize(const MCSectionData *SD) const { + assert(SD->Size != ~UINT64_C(0) && "File size not set!"); + return 
SD->Size; +} +void MCAsmLayout::setSectionSize(MCSectionData *SD, uint64_t Value) { + SD->Size = Value; +} + +uint64_t MCAsmLayout::getSectionFileSize(const MCSectionData *SD) const { + assert(SD->FileSize != ~UINT64_C(0) && "File size not set!"); + return SD->FileSize; +} +void MCAsmLayout::setSectionFileSize(MCSectionData *SD, uint64_t Value) { + SD->FileSize = Value; +} + + /// @} + +/* *** */ + MCFragment::MCFragment() : Kind(FragmentType(~0)) { } MCFragment::MCFragment(FragmentType _Kind, MCSectionData *_Parent) : Kind(_Kind), Parent(_Parent), - FileSize(~UINT64_C(0)) + EffectiveSize(~UINT64_C(0)) { if (Parent) Parent->getFragmentList().push_back(this); @@ -55,11 +152,6 @@ MCFragment::MCFragment(FragmentType _Kind, MCSectionData *_Parent) MCFragment::~MCFragment() { } -uint64_t MCFragment::getAddress() const { - assert(getParent() && "Missing Section!"); - return getParent()->getAddress() + Offset; -} - /* *** */ MCSectionData::MCSectionData() : Section(0) {} @@ -95,7 +187,7 @@ MCSymbolData::MCSymbolData(const MCSymbol &_Symbol, MCFragment *_Fragment, MCAssembler::MCAssembler(MCContext &_Context, TargetAsmBackend &_Backend, MCCodeEmitter &_Emitter, raw_ostream &_OS) : Context(_Context), Backend(_Backend), Emitter(_Emitter), - OS(_OS), SubsectionsViaSymbols(false) + OS(_OS), RelaxAll(false), SubsectionsViaSymbols(false) { } @@ -104,7 +196,6 @@ MCAssembler::~MCAssembler() { static bool isScatteredFixupFullyResolvedSimple(const MCAssembler &Asm, const MCAsmFixup &Fixup, - const MCDataFragment *DF, const MCValue Target, const MCSection *BaseSection) { // The effective fixup address is @@ -141,8 +232,8 @@ static bool isScatteredFixupFullyResolvedSimple(const MCAssembler &Asm, } static bool isScatteredFixupFullyResolved(const MCAssembler &Asm, + const MCAsmLayout &Layout, const MCAsmFixup &Fixup, - const MCDataFragment *DF, const MCValue Target, const MCSymbolData *BaseSymbol) { // The effective fixup address is @@ -162,7 +253,7 @@ static bool 
isScatteredFixupFullyResolved(const MCAssembler &Asm, if (A->getKind() != MCSymbolRefExpr::VK_None) return false; - A_Base = Asm.getAtom(&Asm.getSymbolData(A->getSymbol())); + A_Base = Asm.getAtom(Layout, &Asm.getSymbolData(A->getSymbol())); if (!A_Base) return false; } @@ -172,7 +263,7 @@ static bool isScatteredFixupFullyResolved(const MCAssembler &Asm, if (B->getKind() != MCSymbolRefExpr::VK_None) return false; - B_Base = Asm.getAtom(&Asm.getSymbolData(B->getSymbol())); + B_Base = Asm.getAtom(Layout, &Asm.getSymbolData(B->getSymbol())); if (!B_Base) return false; } @@ -200,9 +291,13 @@ bool MCAssembler::isSymbolLinkerVisible(const MCSymbolData *SD) const { SD->getFragment()->getParent()->getSection()); } -const MCSymbolData *MCAssembler::getAtomForAddress(const MCSectionData *Section, +// FIXME-PERF: This routine is really slow. +const MCSymbolData *MCAssembler::getAtomForAddress(const MCAsmLayout &Layout, + const MCSectionData *Section, uint64_t Address) const { const MCSymbolData *Best = 0; + uint64_t BestAddress = 0; + for (MCAssembler::const_symbol_iterator it = symbol_begin(), ie = symbol_end(); it != ie; ++it) { // Ignore non-linker visible symbols. @@ -215,15 +310,19 @@ const MCSymbolData *MCAssembler::getAtomForAddress(const MCSectionData *Section, // Otherwise, find the closest symbol preceding this address (ties are // resolved in favor of the last defined symbol). - if (it->getAddress() <= Address && - (!Best || it->getAddress() >= Best->getAddress())) + uint64_t SymbolAddress = Layout.getSymbolAddress(it); + if (SymbolAddress <= Address && (!Best || SymbolAddress >= BestAddress)) { Best = it; + BestAddress = SymbolAddress; + } } return Best; } -const MCSymbolData *MCAssembler::getAtom(const MCSymbolData *SD) const { +// FIXME-PERF: This routine is really slow. +const MCSymbolData *MCAssembler::getAtom(const MCAsmLayout &Layout, + const MCSymbolData *SD) const { // Linker visible symbols define atoms. 
if (isSymbolLinkerVisible(SD)) return SD; @@ -233,12 +332,15 @@ const MCSymbolData *MCAssembler::getAtom(const MCSymbolData *SD) const { return 0; // Otherwise, search by address. - return getAtomForAddress(SD->getFragment()->getParent(), SD->getAddress()); + return getAtomForAddress(Layout, SD->getFragment()->getParent(), + Layout.getSymbolAddress(SD)); } -bool MCAssembler::EvaluateFixup(const MCAsmLayout &Layout, MCAsmFixup &Fixup, - MCDataFragment *DF, +bool MCAssembler::EvaluateFixup(const MCAsmLayout &Layout, + const MCAsmFixup &Fixup, const MCFragment *DF, MCValue &Target, uint64_t &Value) const { + ++stats::EvaluateFixup; + if (!Fixup.Value->EvaluateAsRelocatable(Target, &Layout)) llvm_report_error("expected relocatable expression"); @@ -253,13 +355,13 @@ bool MCAssembler::EvaluateFixup(const MCAsmLayout &Layout, MCAsmFixup &Fixup, bool IsResolved = true; if (const MCSymbolRefExpr *A = Target.getSymA()) { if (A->getSymbol().isDefined()) - Value += getSymbolData(A->getSymbol()).getAddress(); + Value += Layout.getSymbolAddress(&getSymbolData(A->getSymbol())); else IsResolved = false; } if (const MCSymbolRefExpr *B = Target.getSymB()) { if (B->getSymbol().isDefined()) - Value -= getSymbolData(B->getSymbol()).getAddress(); + Value -= Layout.getSymbolAddress(&getSymbolData(B->getSymbol())); else IsResolved = false; } @@ -273,55 +375,90 @@ bool MCAssembler::EvaluateFixup(const MCAsmLayout &Layout, MCAsmFixup &Fixup, const MCSymbolData *BaseSymbol = 0; if (IsPCRel) { BaseSymbol = getAtomForAddress( - DF->getParent(), DF->getAddress() + Fixup.Offset); + Layout, DF->getParent(), Layout.getFragmentAddress(DF)+Fixup.Offset); if (!BaseSymbol) IsResolved = false; } if (IsResolved) - IsResolved = isScatteredFixupFullyResolved(*this, Fixup, DF, Target, + IsResolved = isScatteredFixupFullyResolved(*this, Layout, Fixup, Target, BaseSymbol); } else { const MCSection *BaseSection = 0; if (IsPCRel) BaseSection = &DF->getParent()->getSection(); - IsResolved = 
isScatteredFixupFullyResolvedSimple(*this, Fixup, DF, Target, + IsResolved = isScatteredFixupFullyResolvedSimple(*this, Fixup, Target, BaseSection); } } if (IsPCRel) - Value -= DF->getAddress() + Fixup.Offset; + Value -= Layout.getFragmentAddress(DF) + Fixup.Offset; return IsResolved; } -void MCAssembler::LayoutSection(MCSectionData &SD) { - MCAsmLayout Layout(*this); - uint64_t Address = SD.getAddress(); +uint64_t MCAssembler::LayoutSection(MCSectionData &SD, + MCAsmLayout &Layout, + uint64_t StartAddress) { + bool IsVirtual = getBackend().isVirtualSection(SD.getSection()); + ++stats::SectionLayouts; + + // Align this section if necessary by adding padding bytes to the previous + // section. It is safe to adjust this out-of-band, because no symbol or + // fragment is allowed to point past the end of the section at any time. + if (uint64_t Pad = OffsetToAlignment(StartAddress, SD.getAlignment())) { + // Unless this section is virtual (where we are allowed to adjust the offset + // freely), the padding goes in the previous section. + if (!IsVirtual) { + // Find the previous non-virtual section. + iterator it = &SD; + assert(it != begin() && "Invalid initial section address!"); + for (--it; getBackend().isVirtualSection(it->getSection()); --it) ; + Layout.setSectionFileSize(&*it, Layout.getSectionFileSize(&*it) + Pad); + } + + StartAddress += Pad; + } + + // Set the aligned section address. + Layout.setSectionAddress(&SD, StartAddress); + + uint64_t Address = StartAddress; for (MCSectionData::iterator it = SD.begin(), ie = SD.end(); it != ie; ++it) { MCFragment &F = *it; - F.setOffset(Address - SD.getAddress()); + ++stats::FragmentLayouts; + + uint64_t FragmentOffset = Address - StartAddress; + Layout.setFragmentOffset(&F, FragmentOffset); // Evaluate fragment size. 
+ uint64_t EffectiveSize = 0; switch (F.getKind()) { case MCFragment::FT_Align: { MCAlignFragment &AF = cast(F); - uint64_t Size = OffsetToAlignment(Address, AF.getAlignment()); - if (Size > AF.getMaxBytesToEmit()) - AF.setFileSize(0); - else - AF.setFileSize(Size); + EffectiveSize = OffsetToAlignment(Address, AF.getAlignment()); + if (EffectiveSize > AF.getMaxBytesToEmit()) + EffectiveSize = 0; break; } case MCFragment::FT_Data: - case MCFragment::FT_Fill: - F.setFileSize(F.getMaxFileSize()); + EffectiveSize = cast(F).getContents().size(); + break; + + case MCFragment::FT_Fill: { + MCFillFragment &FF = cast(F); + EffectiveSize = FF.getValueSize() * FF.getCount(); + break; + } + + case MCFragment::FT_Inst: + EffectiveSize = cast(F).getInstSize(); break; case MCFragment::FT_Org: { @@ -332,12 +469,12 @@ void MCAssembler::LayoutSection(MCSectionData &SD) { llvm_report_error("expected assembly-time absolute expression"); // FIXME: We need a way to communicate this error. - int64_t Offset = TargetLocation - F.getOffset(); + int64_t Offset = TargetLocation - FragmentOffset; if (Offset < 0) llvm_report_error("invalid .org offset '" + Twine(TargetLocation) + - "' (at offset '" + Twine(F.getOffset()) + "'"); + "' (at offset '" + Twine(FragmentOffset) + "'"); - F.setFileSize(Offset); + EffectiveSize = Offset; break; } @@ -346,114 +483,66 @@ void MCAssembler::LayoutSection(MCSectionData &SD) { // Align the fragment offset; it is safe to adjust the offset freely since // this is only in virtual sections. + // + // FIXME: We shouldn't be doing this here. Address = RoundUpToAlignment(Address, ZFF.getAlignment()); - F.setOffset(Address - SD.getAddress()); + Layout.setFragmentOffset(&F, Address - StartAddress); - // FIXME: This is misnamed. - F.setFileSize(ZFF.getSize()); + EffectiveSize = ZFF.getSize(); break; } } - Address += F.getFileSize(); + Layout.setFragmentEffectiveSize(&F, EffectiveSize); + Address += EffectiveSize; } // Set the section sizes. 
- SD.setSize(Address - SD.getAddress()); - if (getBackend().isVirtualSection(SD.getSection())) - SD.setFileSize(0); + Layout.setSectionSize(&SD, Address - StartAddress); + if (IsVirtual) + Layout.setSectionFileSize(&SD, 0); else - SD.setFileSize(Address - SD.getAddress()); -} + Layout.setSectionFileSize(&SD, Address - StartAddress); -/// WriteNopData - Write optimal nops to the output file for the \arg Count -/// bytes. This returns the number of bytes written. It may return 0 if -/// the \arg Count is more than the maximum optimal nops. -/// -/// FIXME this is X86 32-bit specific and should move to a better place. -static uint64_t WriteNopData(uint64_t Count, MCObjectWriter *OW) { - static const uint8_t Nops[16][16] = { - // nop - {0x90}, - // xchg %ax,%ax - {0x66, 0x90}, - // nopl (%[re]ax) - {0x0f, 0x1f, 0x00}, - // nopl 0(%[re]ax) - {0x0f, 0x1f, 0x40, 0x00}, - // nopl 0(%[re]ax,%[re]ax,1) - {0x0f, 0x1f, 0x44, 0x00, 0x00}, - // nopw 0(%[re]ax,%[re]ax,1) - {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00}, - // nopl 0L(%[re]ax) - {0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00}, - // nopl 0L(%[re]ax,%[re]ax,1) - {0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}, - // nopw 0L(%[re]ax,%[re]ax,1) - {0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}, - // nopw %cs:0L(%[re]ax,%[re]ax,1) - {0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}, - // nopl 0(%[re]ax,%[re]ax,1) - // nopw 0(%[re]ax,%[re]ax,1) - {0x0f, 0x1f, 0x44, 0x00, 0x00, - 0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00}, - // nopw 0(%[re]ax,%[re]ax,1) - // nopw 0(%[re]ax,%[re]ax,1) - {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00, - 0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00}, - // nopw 0(%[re]ax,%[re]ax,1) - // nopl 0L(%[re]ax) */ - {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00, - 0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00}, - // nopl 0L(%[re]ax) - // nopl 0L(%[re]ax) - {0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00, - 0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00}, - // nopl 0L(%[re]ax) - // nopl 0L(%[re]ax,%[re]ax,1) - {0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00, 
- 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00} - }; - - if (Count > 15) - return 0; - - for (uint64_t i = 0; i < Count; i++) - OW->Write8(uint8_t(Nops[Count - 1][i])); - - return Count; + return Address; } /// WriteFragmentData - Write the \arg F data to the output file. -static void WriteFragmentData(const MCFragment &F, MCObjectWriter *OW) { +static void WriteFragmentData(const MCAssembler &Asm, const MCAsmLayout &Layout, + const MCFragment &F, MCObjectWriter *OW) { uint64_t Start = OW->getStream().tell(); (void) Start; - ++EmittedFragments; + ++stats::EmittedFragments; // FIXME: Embed in fragments instead? + uint64_t FragmentSize = Layout.getFragmentEffectiveSize(&F); switch (F.getKind()) { case MCFragment::FT_Align: { MCAlignFragment &AF = cast(F); - uint64_t Count = AF.getFileSize() / AF.getValueSize(); + uint64_t Count = FragmentSize / AF.getValueSize(); // FIXME: This error shouldn't actually occur (the front end should emit // multiple .align directives to enforce the semantics it wants), but is // severe enough that we want to report it. How to handle this? - if (Count * AF.getValueSize() != AF.getFileSize()) + if (Count * AF.getValueSize() != FragmentSize) llvm_report_error("undefined .align directive, value size '" + Twine(AF.getValueSize()) + "' is not a divisor of padding size '" + - Twine(AF.getFileSize()) + "'"); + Twine(FragmentSize) + "'"); // See if we are aligning with nops, and if so do that first to try to fill // the Count bytes. Then if that did not fill any bytes or there are any // bytes left to fill use the the Value and ValueSize to fill the rest. + // If we are aligning with nops, ask that target to emit the right data. if (AF.getEmitNops()) { - uint64_t NopByteCount = WriteNopData(Count, OW); - Count -= NopByteCount; + if (!Asm.getBackend().WriteNopData(Count, OW)) + llvm_report_error("unable to write nop sequence of " + + Twine(Count) + " bytes"); + break; } + // Otherwise, write out in multiples of the value size. 
for (uint64_t i = 0; i != Count; ++i) { switch (AF.getValueSize()) { default: @@ -468,7 +557,9 @@ static void WriteFragmentData(const MCFragment &F, MCObjectWriter *OW) { } case MCFragment::FT_Data: { - OW->WriteBytes(cast(F).getContents().str()); + MCDataFragment &DF = cast(F); + assert(FragmentSize == DF.getContents().size() && "Invalid size!"); + OW->WriteBytes(DF.getContents().str()); break; } @@ -487,10 +578,14 @@ static void WriteFragmentData(const MCFragment &F, MCObjectWriter *OW) { break; } + case MCFragment::FT_Inst: + llvm_unreachable("unexpected inst fragment after lowering"); + break; + case MCFragment::FT_Org: { MCOrgFragment &OF = cast(F); - for (uint64_t i = 0, e = OF.getFileSize(); i != e; ++i) + for (uint64_t i = 0, e = FragmentSize; i != e; ++i) OW->Write8(uint8_t(OF.getValue())); break; @@ -502,14 +597,18 @@ static void WriteFragmentData(const MCFragment &F, MCObjectWriter *OW) { } } - assert(OW->getStream().tell() - Start == F.getFileSize()); + assert(OW->getStream().tell() - Start == FragmentSize); } void MCAssembler::WriteSectionData(const MCSectionData *SD, + const MCAsmLayout &Layout, MCObjectWriter *OW) const { + uint64_t SectionSize = Layout.getSectionSize(SD); + uint64_t SectionFileSize = Layout.getSectionFileSize(SD); + // Ignore virtual sections. if (getBackend().isVirtualSection(SD->getSection())) { - assert(SD->getFileSize() == 0); + assert(SectionFileSize == 0 && "Invalid size for section!"); return; } @@ -518,13 +617,13 @@ void MCAssembler::WriteSectionData(const MCSectionData *SD, for (MCSectionData::const_iterator it = SD->begin(), ie = SD->end(); it != ie; ++it) - WriteFragmentData(*it, OW); + WriteFragmentData(*this, Layout, *it, OW); // Add section padding. 
- assert(SD->getFileSize() >= SD->getSize() && "Invalid section sizes!"); - OW->WriteZeros(SD->getFileSize() - SD->getSize()); + assert(SectionFileSize >= SectionSize && "Invalid section sizes!"); + OW->WriteZeros(SectionFileSize - SectionSize); - assert(OW->getStream().tell() - Start == SD->getFileSize()); + assert(OW->getStream().tell() - Start == SectionFileSize); } void MCAssembler::Finish() { @@ -532,15 +631,35 @@ void MCAssembler::Finish() { llvm::errs() << "assembler backend - pre-layout\n--\n"; dump(); }); + // Assign section and fragment ordinals, all subsequent backend code is + // responsible for updating these in place. + unsigned SectionIndex = 0; + unsigned FragmentIndex = 0; + for (MCAssembler::iterator it = begin(), ie = end(); it != ie; ++it) { + it->setOrdinal(SectionIndex++); + + for (MCSectionData::iterator it2 = it->begin(), + ie2 = it->end(); it2 != ie2; ++it2) + it2->setOrdinal(FragmentIndex++); + } + // Layout until everything fits. - while (LayoutOnce()) + MCAsmLayout Layout(*this); + while (LayoutOnce(Layout)) continue; DEBUG_WITH_TYPE("mc-dump", { - llvm::errs() << "assembler backend - post-layout\n--\n"; + llvm::errs() << "assembler backend - post-relaxation\n--\n"; dump(); }); - // FIXME: Factor out MCObjectWriter. + // Finalize the layout, including fragment lowering. + FinishLayout(Layout); + + DEBUG_WITH_TYPE("mc-dump", { + llvm::errs() << "assembler backend - final-layout\n--\n"; + dump(); }); + + uint64_t StartOffset = OS.tell(); llvm::OwningPtr Writer(getBackend().createObjectWriter(OS)); if (!Writer) llvm_report_error("unable to create object writer!"); @@ -550,9 +669,6 @@ void MCAssembler::Finish() { Writer->ExecutePostLayoutBinding(*this); // Evaluate and apply the fixups, generating relocation entries as necessary. - // - // FIXME: Share layout object. 
- MCAsmLayout Layout(*this); for (MCAssembler::iterator it = begin(), ie = end(); it != ie; ++it) { for (MCSectionData::iterator it2 = it->begin(), ie2 = it->end(); it2 != ie2; ++it2) { @@ -571,7 +687,7 @@ void MCAssembler::Finish() { // The fixup was unresolved, we need a relocation. Inform the object // writer of the relocation, and give it an opportunity to adjust the // fixup value if need be. - Writer->RecordRelocation(*this, *DF, Fixup, Target, FixedValue); + Writer->RecordRelocation(*this, Layout, DF, Fixup, Target,FixedValue); } getBackend().ApplyFixup(Fixup, *DF, FixedValue); @@ -580,17 +696,17 @@ void MCAssembler::Finish() { } // Write the object file. - Writer->WriteObject(*this); + Writer->WriteObject(*this, Layout); OS.flush(); + + stats::ObjectBytes += OS.tell() - StartOffset; } -bool MCAssembler::FixupNeedsRelaxation(MCAsmFixup &Fixup, MCDataFragment *DF) { - // FIXME: Share layout object. - MCAsmLayout Layout(*this); - - // Currently we only need to relax X86::reloc_pcrel_1byte. - if (unsigned(Fixup.Kind) != X86::reloc_pcrel_1byte) - return false; +bool MCAssembler::FixupNeedsRelaxation(const MCAsmFixup &Fixup, + const MCFragment *DF, + const MCAsmLayout &Layout) const { + if (getRelaxAll()) + return true; // If we cannot resolve the fixup value, it requires relaxation. MCValue Target; @@ -602,135 +718,141 @@ bool MCAssembler::FixupNeedsRelaxation(MCAsmFixup &Fixup, MCDataFragment *DF) { return int64_t(Value) != int64_t(int8_t(Value)); } -bool MCAssembler::LayoutOnce() { +bool MCAssembler::FragmentNeedsRelaxation(const MCInstFragment *IF, + const MCAsmLayout &Layout) const { + // If this inst doesn't ever need relaxation, ignore it. This occurs when we + // are intentionally pushing out inst fragments, or because we relaxed a + // previous instruction to one that doesn't need relaxation. 
+ if (!getBackend().MayNeedRelaxation(IF->getInst(), IF->getFixups())) + return false; + + for (MCInstFragment::const_fixup_iterator it = IF->fixup_begin(), + ie = IF->fixup_end(); it != ie; ++it) + if (FixupNeedsRelaxation(*it, IF, Layout)) + return true; + + return false; +} + +bool MCAssembler::LayoutOnce(MCAsmLayout &Layout) { + ++stats::RelaxationSteps; + // Layout the concrete sections and fragments. uint64_t Address = 0; - MCSectionData *Prev = 0; for (iterator it = begin(), ie = end(); it != ie; ++it) { - MCSectionData &SD = *it; - // Skip virtual sections. - if (getBackend().isVirtualSection(SD.getSection())) + if (getBackend().isVirtualSection(it->getSection())) continue; - // Align this section if necessary by adding padding bytes to the previous - // section. - if (uint64_t Pad = OffsetToAlignment(Address, it->getAlignment())) { - assert(Prev && "Missing prev section!"); - Prev->setFileSize(Prev->getFileSize() + Pad); - Address += Pad; - } - // Layout the section fragments and its size. - SD.setAddress(Address); - LayoutSection(SD); - Address += SD.getFileSize(); - - Prev = &SD; + Address = LayoutSection(*it, Layout, Address); } // Layout the virtual sections. for (iterator it = begin(), ie = end(); it != ie; ++it) { - MCSectionData &SD = *it; - - if (!getBackend().isVirtualSection(SD.getSection())) + if (!getBackend().isVirtualSection(it->getSection())) continue; - // Align this section if necessary by adding padding bytes to the previous - // section. - if (uint64_t Pad = OffsetToAlignment(Address, it->getAlignment())) - Address += Pad; - - SD.setAddress(Address); - LayoutSection(SD); - Address += SD.getSize(); + // Layout the section fragments and its size. + Address = LayoutSection(*it, Layout, Address); } - // Scan the fixups in order and relax any that don't fit. + // Scan for fragments that need relaxation. 
+ bool WasRelaxed = false; for (iterator it = begin(), ie = end(); it != ie; ++it) { MCSectionData &SD = *it; for (MCSectionData::iterator it2 = SD.begin(), ie2 = SD.end(); it2 != ie2; ++it2) { - MCDataFragment *DF = dyn_cast(it2); - if (!DF) + // Check if this is an instruction fragment that needs relaxation. + MCInstFragment *IF = dyn_cast(it2); + if (!IF || !FragmentNeedsRelaxation(IF, Layout)) continue; - for (MCDataFragment::fixup_iterator it3 = DF->fixup_begin(), - ie3 = DF->fixup_end(); it3 != ie3; ++it3) { - MCAsmFixup &Fixup = *it3; + ++stats::RelaxedInstructions; - // Check whether we need to relax this fixup. - if (!FixupNeedsRelaxation(Fixup, DF)) - continue; + // FIXME-PERF: We could immediately lower out instructions if we can tell + // they are fully resolved, to avoid retesting on later passes. - // Relax the instruction. - // - // FIXME: This is a huge temporary hack which just looks for x86 - // branches; the only thing we need to relax on x86 is - // 'X86::reloc_pcrel_1byte'. Once we have MCInst fragments, this will be - // replaced by a TargetAsmBackend hook (most likely tblgen'd) to relax - // an individual MCInst. - SmallVectorImpl &C = DF->getContents(); - uint64_t PrevOffset = Fixup.Offset; - unsigned Amt = 0; + // Relax the fragment. - // jcc instructions - if (unsigned(C[Fixup.Offset-1]) >= 0x70 && - unsigned(C[Fixup.Offset-1]) <= 0x7f) { - C[Fixup.Offset] = C[Fixup.Offset-1] + 0x10; - C[Fixup.Offset-1] = char(0x0f); - ++Fixup.Offset; - Amt = 4; + MCInst Relaxed; + getBackend().RelaxInstruction(IF, Relaxed); - // jmp rel8 - } else if (C[Fixup.Offset-1] == char(0xeb)) { - C[Fixup.Offset-1] = char(0xe9); - Amt = 3; + // Encode the new instruction. + // + // FIXME-PERF: If it matters, we could let the target do this. It can + // probably do so more efficiently in many cases. 
+ SmallVector Fixups; + SmallString<256> Code; + raw_svector_ostream VecOS(Code); + getEmitter().EncodeInstruction(Relaxed, VecOS, Fixups); + VecOS.flush(); - } else - llvm_unreachable("unknown 1 byte pcrel instruction!"); - - Fixup.Value = MCBinaryExpr::Create( - MCBinaryExpr::Sub, Fixup.Value, - MCConstantExpr::Create(3, getContext()), - getContext()); - C.insert(C.begin() + Fixup.Offset, Amt, char(0)); - Fixup.Kind = MCFixupKind(X86::reloc_pcrel_4byte); - - // Update the remaining fixups, which have slid. - // - // FIXME: This is bad for performance, but will be eliminated by the - // move to MCInst specific fragments. - ++it3; - for (; it3 != ie3; ++it3) - it3->Offset += Amt; - - // Update all the symbols for this fragment, which may have slid. - // - // FIXME: This is really really bad for performance, but will be - // eliminated by the move to MCInst specific fragments. - for (MCAssembler::symbol_iterator it = symbol_begin(), - ie = symbol_end(); it != ie; ++it) { - MCSymbolData &SD = *it; - - if (it->getFragment() != DF) - continue; - - if (SD.getOffset() > PrevOffset) - SD.setOffset(SD.getOffset() + Amt); - } - - // Restart layout. - // - // FIXME: This is O(N^2), but will be eliminated once we have a smart - // MCAsmLayout object. - return true; + // Update the instruction fragment. + int SlideAmount = Code.size() - IF->getInstSize(); + IF->setInst(Relaxed); + IF->getCode() = Code; + IF->getFixups().clear(); + for (unsigned i = 0, e = Fixups.size(); i != e; ++i) { + MCFixup &F = Fixups[i]; + IF->getFixups().push_back(MCAsmFixup(F.getOffset(), *F.getValue(), + F.getKind())); } + + // Update the layout, and remember that we relaxed. If we are relaxing + // everything, we can skip this step since nothing will depend on updating + // the values. 
+ if (!getRelaxAll()) + Layout.UpdateForSlide(IF, SlideAmount); + WasRelaxed = true; } } - return false; + return WasRelaxed; +} + +void MCAssembler::FinishLayout(MCAsmLayout &Layout) { + // Lower out any instruction fragments, to simplify the fixup application and + // output. + // + // FIXME-PERF: We don't have to do this, but the assumption is that it is + // cheap (we will mostly end up eliminating fragments and appending on to data + // fragments), so the extra complexity downstream isn't worth it. Evaluate + // this assumption. + for (iterator it = begin(), ie = end(); it != ie; ++it) { + MCSectionData &SD = *it; + + for (MCSectionData::iterator it2 = SD.begin(), + ie2 = SD.end(); it2 != ie2; ++it2) { + MCInstFragment *IF = dyn_cast(it2); + if (!IF) + continue; + + // Create a new data fragment for the instruction. + // + // FIXME-PERF: Reuse previous data fragment if possible. + MCDataFragment *DF = new MCDataFragment(); + SD.getFragmentList().insert(it2, DF); + + // Update the data fragments layout data. + // + // FIXME: Add MCAsmLayout utility for this. + DF->setParent(IF->getParent()); + DF->setOrdinal(IF->getOrdinal()); + Layout.setFragmentOffset(DF, Layout.getFragmentOffset(IF)); + Layout.setFragmentEffectiveSize(DF, Layout.getFragmentEffectiveSize(IF)); + + // Copy in the data and the fixups. + DF->getContents().append(IF->getCode().begin(), IF->getCode().end()); + for (unsigned i = 0, e = IF->getFixups().size(); i != e; ++i) + DF->getFixups().push_back(IF->getFixups()[i]); + + // Delete the instruction fragment and update the iterator. 
+ SD.getFragmentList().erase(IF); + it2 = DF; + } + } } // Debugging methods @@ -749,7 +871,7 @@ void MCFragment::dump() { raw_ostream &OS = llvm::errs(); OS << ""; } @@ -801,6 +923,17 @@ void MCFillFragment::dump() { << " Count:" << getCount() << ">"; } +void MCInstFragment::dump() { + raw_ostream &OS = llvm::errs(); + + OS << "MCFragment::dump(); + OS << "\n "; + OS << " Inst:"; + getInst().dump_pretty(OS); + OS << ">"; +} + void MCOrgFragment::dump() { raw_ostream &OS = llvm::errs(); diff --git a/lib/MC/MCContext.cpp b/lib/MC/MCContext.cpp index 37e828252602..e02cbc74622d 100644 --- a/lib/MC/MCContext.cpp +++ b/lib/MC/MCContext.cpp @@ -23,9 +23,12 @@ MCContext::~MCContext() { // we don't need to free them here. } -MCSymbol *MCContext::GetOrCreateSymbol(StringRef Name, bool isTemporary) { +MCSymbol *MCContext::GetOrCreateSymbol(StringRef Name) { assert(!Name.empty() && "Normal symbols cannot be unnamed!"); + // Determine whether this is an assembler temporary or normal label. + bool isTemporary = Name.startswith(MAI.getPrivateGlobalPrefix()); + // Do the lookup and get the entire StringMapEntry. We want access to the // key if we are creating the entry. 
StringMapEntry &Entry = Symbols.GetOrCreateValue(Name); @@ -38,24 +41,17 @@ MCSymbol *MCContext::GetOrCreateSymbol(StringRef Name, bool isTemporary) { return Result; } -MCSymbol *MCContext::GetOrCreateSymbol(const Twine &Name, bool isTemporary) { +MCSymbol *MCContext::GetOrCreateSymbol(const Twine &Name) { SmallString<128> NameSV; Name.toVector(NameSV); - return GetOrCreateSymbol(NameSV.str(), isTemporary); + return GetOrCreateSymbol(NameSV.str()); } MCSymbol *MCContext::CreateTempSymbol() { - return GetOrCreateTemporarySymbol(Twine(MAI.getPrivateGlobalPrefix()) + - "tmp" + Twine(NextUniqueID++)); + return GetOrCreateSymbol(Twine(MAI.getPrivateGlobalPrefix()) + + "tmp" + Twine(NextUniqueID++)); } -MCSymbol *MCContext::GetOrCreateTemporarySymbol(const Twine &Name) { - SmallString<128> NameSV; - Name.toVector(NameSV); - return GetOrCreateTemporarySymbol(NameSV.str()); -} - - MCSymbol *MCContext::LookupSymbol(StringRef Name) const { return Symbols.lookup(Name); } diff --git a/lib/MC/MCExpr.cpp b/lib/MC/MCExpr.cpp index 275994431ee0..bc670abc1bbd 100644 --- a/lib/MC/MCExpr.cpp +++ b/lib/MC/MCExpr.cpp @@ -7,7 +7,9 @@ // //===----------------------------------------------------------------------===// +#define DEBUG_TYPE "mcexpr" #include "llvm/MC/MCExpr.h" +#include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/MC/MCAsmLayout.h" #include "llvm/MC/MCAssembler.h" @@ -19,6 +21,12 @@ #include "llvm/Target/TargetAsmBackend.h" using namespace llvm; +namespace { +namespace stats { +STATISTIC(MCExprEvaluate, "Number of MCExpr evaluations"); +} +} + void MCExpr::print(raw_ostream &OS) const { switch (getKind()) { case MCExpr::Target: @@ -146,12 +154,6 @@ const MCSymbolRefExpr *MCSymbolRefExpr::Create(StringRef Name, VariantKind Kind, return Create(Ctx.GetOrCreateSymbol(Name), Kind, Ctx); } -const MCSymbolRefExpr *MCSymbolRefExpr::CreateTemp(StringRef Name, - VariantKind Kind, - MCContext &Ctx) { - return Create(Ctx.GetOrCreateTemporarySymbol(Name), 
Kind, Ctx); -} - StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) { switch (Kind) { default: @@ -194,6 +196,12 @@ void MCTargetExpr::Anchor() {} bool MCExpr::EvaluateAsAbsolute(int64_t &Res, const MCAsmLayout *Layout) const { MCValue Value; + // Fast path constants. + if (const MCConstantExpr *CE = dyn_cast(this)) { + Res = CE->getValue(); + return true; + } + if (!EvaluateAsRelocatable(Value, Layout) || !Value.isAbsolute()) return false; @@ -225,6 +233,8 @@ static bool EvaluateSymbolicAdd(const MCValue &LHS,const MCSymbolRefExpr *RHS_A, bool MCExpr::EvaluateAsRelocatable(MCValue &Res, const MCAsmLayout *Layout) const { + ++stats::MCExprEvaluate; + switch (getKind()) { case Target: return cast(this)->EvaluateAsRelocatableImpl(Res, Layout); @@ -252,8 +262,8 @@ bool MCExpr::EvaluateAsRelocatable(MCValue &Res, Layout->getAssembler().getSymbolData(Res.getSymA()->getSymbol()); MCSymbolData &B = Layout->getAssembler().getSymbolData(Res.getSymB()->getSymbol()); - Res = MCValue::get(+ A.getFragment()->getAddress() + A.getOffset() - - B.getFragment()->getAddress() - B.getOffset() + Res = MCValue::get(+ Layout->getSymbolAddress(&A) + - Layout->getSymbolAddress(&B) + Res.getConstant()); } diff --git a/lib/MC/MCInst.cpp b/lib/MC/MCInst.cpp index 0634c9faf6fa..de142dc9553e 100644 --- a/lib/MC/MCInst.cpp +++ b/lib/MC/MCInst.cpp @@ -9,6 +9,7 @@ #include "llvm/MC/MCInst.h" #include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInstPrinter.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -43,6 +44,22 @@ void MCInst::print(raw_ostream &OS, const MCAsmInfo *MAI) const { OS << ">"; } +void MCInst::dump_pretty(raw_ostream &OS, const MCAsmInfo *MAI, + const MCInstPrinter *Printer, + StringRef Separator) const { + OS << "getOpcodeName(getOpcode()); + + for (unsigned i = 0, e = getNumOperands(); i != e; ++i) { + OS << Separator; + getOperand(i).print(OS, MAI); + } + OS << ">\n"; +} + void MCInst::dump() const { print(dbgs(), 0); dbgs() << "\n"; diff 
--git a/lib/MC/MCMachOStreamer.cpp b/lib/MC/MCMachOStreamer.cpp index 9504392bc1bc..120f837b26ff 100644 --- a/lib/MC/MCMachOStreamer.cpp +++ b/lib/MC/MCMachOStreamer.cpp @@ -18,6 +18,8 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetAsmBackend.h" + using namespace llvm; namespace { @@ -57,6 +59,15 @@ class MCMachOStreamer : public MCStreamer { return 0; } + /// Get a data fragment to write into, creating a new one if the current + /// fragment is not a data fragment. + MCDataFragment *getOrCreateDataFragment() const { + MCDataFragment *F = dyn_cast_or_null(getCurrentFragment()); + if (!F) + F = new MCDataFragment(CurSectionData); + return F; + } + public: MCMachOStreamer(MCContext &Context, TargetAsmBackend &TAB, raw_ostream &_OS, MCCodeEmitter *_Emitter) @@ -64,6 +75,8 @@ class MCMachOStreamer : public MCStreamer { CurSectionData(0) {} ~MCMachOStreamer() {} + MCAssembler &getAssembler() { return Assembler; } + const MCExpr *AddValueSymbols(const MCExpr *Value) { switch (Value->getKind()) { case MCExpr::Target: assert(0 && "Can't handle target exprs yet!"); @@ -150,11 +163,11 @@ void MCMachOStreamer::SwitchSection(const MCSection *Section) { void MCMachOStreamer::EmitLabel(MCSymbol *Symbol) { assert(Symbol->isUndefined() && "Cannot define a symbol twice!"); - // FIXME: We should also use offsets into Fill fragments. - MCDataFragment *F = dyn_cast_or_null(getCurrentFragment()); - if (!F) - F = new MCDataFragment(CurSectionData); - + // FIXME: This is wasteful, we don't necessarily need to create a data + // fragment. Instead, we should mark the symbol as pointing into the data + // fragment if it exists, otherwise we should just queue the label and set its + // fragment pointer when we emit the next fragment. 
+ MCDataFragment *F = getOrCreateDataFragment(); MCSymbolData &SD = Assembler.getOrCreateSymbolData(*Symbol); assert(!SD.getFragment() && "Unexpected fragment on symbol data!"); SD.setFragment(F); @@ -307,17 +320,12 @@ void MCMachOStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol, } void MCMachOStreamer::EmitBytes(StringRef Data, unsigned AddrSpace) { - MCDataFragment *DF = dyn_cast_or_null(getCurrentFragment()); - if (!DF) - DF = new MCDataFragment(CurSectionData); - DF->getContents().append(Data.begin(), Data.end()); + getOrCreateDataFragment()->getContents().append(Data.begin(), Data.end()); } void MCMachOStreamer::EmitValue(const MCExpr *Value, unsigned Size, unsigned AddrSpace) { - MCDataFragment *DF = dyn_cast_or_null(getCurrentFragment()); - if (!DF) - DF = new MCDataFragment(CurSectionData); + MCDataFragment *DF = getOrCreateDataFragment(); // Avoid fixups when possible. int64_t AbsValue; @@ -349,8 +357,7 @@ void MCMachOStreamer::EmitCodeAlignment(unsigned ByteAlignment, unsigned MaxBytesToEmit) { if (MaxBytesToEmit == 0) MaxBytesToEmit = ByteAlignment; - // FIXME the 0x90 is the default x86 1 byte nop opcode. - new MCAlignFragment(ByteAlignment, 0x90, 1, MaxBytesToEmit, + new MCAlignFragment(ByteAlignment, 0, 1, MaxBytesToEmit, true /* EmitNops */, CurSectionData); // Update the maximum alignment on the current section if necessary. @@ -371,20 +378,54 @@ void MCMachOStreamer::EmitInstruction(const MCInst &Inst) { CurSectionData->setHasInstructions(true); + // FIXME-PERF: Common case is that we don't need to relax, encode directly + // onto the data fragments buffers. + SmallVector Fixups; SmallString<256> Code; raw_svector_ostream VecOS(Code); Assembler.getEmitter().EncodeInstruction(Inst, VecOS, Fixups); VecOS.flush(); - // Add the fixups and data. - MCDataFragment *DF = dyn_cast_or_null(getCurrentFragment()); - if (!DF) - DF = new MCDataFragment(CurSectionData); + // FIXME: Eliminate this copy. 
+ SmallVector AsmFixups; for (unsigned i = 0, e = Fixups.size(); i != e; ++i) { MCFixup &F = Fixups[i]; - DF->addFixup(MCAsmFixup(DF->getContents().size()+F.getOffset(), - *F.getValue(), F.getKind())); + AsmFixups.push_back(MCAsmFixup(F.getOffset(), *F.getValue(), + F.getKind())); + } + + // See if we might need to relax this instruction, if so it needs its own + // fragment. + // + // FIXME-PERF: Support target hook to do a fast path that avoids the encoder, + // when we can immediately tell that we will get something which might need + // relaxation (and compute its size). + // + // FIXME-PERF: We should also be smart about immediately relaxing instructions + // which we can already show will never possibly fit (we can also do a very + // good job of this before we do the first relaxation pass, because we have + // total knowledge about undefined symbols at that point). Even now, though, + // we can do a decent job, especially on Darwin where scattering means that we + // are going to often know that we can never fully resolve a fixup. + if (Assembler.getBackend().MayNeedRelaxation(Inst, AsmFixups)) { + MCInstFragment *IF = new MCInstFragment(Inst, CurSectionData); + + // Add the fixups and data. + // + // FIXME: Revisit this design decision when relaxation is done, we may be + // able to get away with not storing any extra data in the MCInst. + IF->getCode() = Code; + IF->getFixups() = AsmFixups; + + return; + } + + // Add the fixups and data. 
+ MCDataFragment *DF = getOrCreateDataFragment(); + for (unsigned i = 0, e = AsmFixups.size(); i != e; ++i) { + AsmFixups[i].Offset += DF->getContents().size(); + DF->addFixup(AsmFixups[i]); } DF->getContents().append(Code.begin(), Code.end()); } @@ -394,6 +435,10 @@ void MCMachOStreamer::Finish() { } MCStreamer *llvm::createMachOStreamer(MCContext &Context, TargetAsmBackend &TAB, - raw_ostream &OS, MCCodeEmitter *CE) { - return new MCMachOStreamer(Context, TAB, OS, CE); + raw_ostream &OS, MCCodeEmitter *CE, + bool RelaxAll) { + MCMachOStreamer *S = new MCMachOStreamer(Context, TAB, OS, CE); + if (RelaxAll) + S->getAssembler().setRelaxAll(true); + return S; } diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp index 4ec5247d62eb..24616b40b8e3 100644 --- a/lib/MC/MCParser/AsmParser.cpp +++ b/lib/MC/MCParser/AsmParser.cpp @@ -238,9 +238,7 @@ bool AsmParser::ParseParenExpr(const MCExpr *&Res, SMLoc &EndLoc) { } MCSymbol *AsmParser::CreateSymbol(StringRef Name) { - // If the label starts with L it is an assembler temporary label. - if (Name.startswith("L")) - return Ctx.GetOrCreateTemporarySymbol(Name); + // FIXME: Inline into callers. 
return Ctx.GetOrCreateSymbol(Name); } diff --git a/lib/MC/MCSection.cpp b/lib/MC/MCSection.cpp index 24c89efefc66..f6e96368eb20 100644 --- a/lib/MC/MCSection.cpp +++ b/lib/MC/MCSection.cpp @@ -26,7 +26,11 @@ MCSection::~MCSection() { MCSectionCOFF *MCSectionCOFF:: Create(StringRef Name, bool IsDirective, SectionKind K, MCContext &Ctx) { - return new (Ctx) MCSectionCOFF(Name, IsDirective, K); + char *NameCopy = static_cast( + Ctx.Allocate(Name.size(), /*Alignment=*/1)); + memcpy(NameCopy, Name.data(), Name.size()); + return new (Ctx) MCSectionCOFF(StringRef(NameCopy, Name.size()), + IsDirective, K); } void MCSectionCOFF::PrintSwitchToSection(const MCAsmInfo &MAI, diff --git a/lib/MC/MachObjectWriter.cpp b/lib/MC/MachObjectWriter.cpp index 4b08c226d144..e073eb5a0d46 100644 --- a/lib/MC/MachObjectWriter.cpp +++ b/lib/MC/MachObjectWriter.cpp @@ -11,6 +11,7 @@ #include "llvm/ADT/StringMap.h" #include "llvm/ADT/Twine.h" #include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCAsmLayout.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCSectionMachO.h" @@ -271,12 +272,14 @@ class MachObjectWriterImpl { assert(OS.tell() - Start == SegmentLoadCommandSize); } - void WriteSection(const MCAssembler &Asm, const MCSectionData &SD, - uint64_t FileOffset, uint64_t RelocationsStart, - unsigned NumRelocations) { + void WriteSection(const MCAssembler &Asm, const MCAsmLayout &Layout, + const MCSectionData &SD, uint64_t FileOffset, + uint64_t RelocationsStart, unsigned NumRelocations) { + uint64_t SectionSize = Layout.getSectionSize(&SD); + // The offset is unused for virtual sections. 
if (Asm.getBackend().isVirtualSection(SD.getSection())) { - assert(SD.getFileSize() == 0 && "Invalid file size!"); + assert(Layout.getSectionFileSize(&SD) == 0 && "Invalid file size!"); FileOffset = 0; } @@ -292,11 +295,11 @@ class MachObjectWriterImpl { WriteBytes(Section.getSectionName(), 16); WriteBytes(Section.getSegmentName(), 16); if (Is64Bit) { - Write64(SD.getAddress()); // address - Write64(SD.getSize()); // size + Write64(Layout.getSectionAddress(&SD)); // address + Write64(SectionSize); // size } else { - Write32(SD.getAddress()); // address - Write32(SD.getSize()); // size + Write32(Layout.getSectionAddress(&SD)); // address + Write32(SectionSize); // size } Write32(FileOffset); @@ -372,7 +375,7 @@ class MachObjectWriterImpl { assert(OS.tell() - Start == DysymtabLoadCommandSize); } - void WriteNlist(MachSymbolData &MSD) { + void WriteNlist(MachSymbolData &MSD, const MCAsmLayout &Layout) { MCSymbolData &Data = *MSD.SymbolData; const MCSymbol &Symbol = Data.getSymbol(); uint8_t Type = 0; @@ -403,7 +406,7 @@ class MachObjectWriterImpl { if (Symbol.isAbsolute()) { llvm_unreachable("FIXME: Not yet implemented!"); } else { - Address = Data.getAddress(); + Address = Layout.getSymbolAddress(&Data); } } else if (Data.isCommon()) { // Common symbols are encoded with the size in the address @@ -437,8 +440,22 @@ class MachObjectWriterImpl { Write32(Address); } - void RecordX86_64Relocation(const MCAssembler &Asm, - const MCDataFragment &Fragment, + // FIXME: We really need to improve the relocation validation. Basically, we + // want to implement a separate computation which evaluates the relocation + // entry as the linker would, and verifies that the resultant fixup value is + // exactly what the encoder wanted. This will catch several classes of + // problems: + // + // - Relocation entry bugs, the two algorithms are unlikely to have the same + // exact bug. + // + // - Relaxation issues, where we forget to relax something. 
+ // + // - Input errors, where something cannot be correctly encoded. 'as' allows + // these through in many cases. + + void RecordX86_64Relocation(const MCAssembler &Asm, const MCAsmLayout &Layout, + const MCFragment *Fragment, const MCAsmFixup &Fixup, MCValue Target, uint64_t &FixedValue) { unsigned IsPCRel = isFixupKindPCRel(Fixup.Kind); @@ -446,7 +463,7 @@ class MachObjectWriterImpl { unsigned Log2Size = getFixupKindLog2Size(Fixup.Kind); // See . - uint32_t Address = Fragment.getOffset() + Fixup.Offset; + uint32_t Address = Layout.getFragmentOffset(Fragment) + Fixup.Offset; int64_t Value = 0; unsigned Index = 0; unsigned IsExtern = 0; @@ -480,11 +497,11 @@ class MachObjectWriterImpl { } else if (Target.getSymB()) { // A - B + constant const MCSymbol *A = &Target.getSymA()->getSymbol(); MCSymbolData &A_SD = Asm.getSymbolData(*A); - const MCSymbolData *A_Base = Asm.getAtom(&A_SD); + const MCSymbolData *A_Base = Asm.getAtom(Layout, &A_SD); const MCSymbol *B = &Target.getSymB()->getSymbol(); MCSymbolData &B_SD = Asm.getSymbolData(*B); - const MCSymbolData *B_Base = Asm.getAtom(&B_SD); + const MCSymbolData *B_Base = Asm.getAtom(Layout, &B_SD); // Neither symbol can be modified. 
if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None || @@ -507,8 +524,8 @@ class MachObjectWriterImpl { if (A_Base == B_Base) llvm_report_error("unsupported relocation with identical base"); - Value += A_SD.getAddress() - A_Base->getAddress(); - Value -= B_SD.getAddress() - B_Base->getAddress(); + Value += Layout.getSymbolAddress(&A_SD) - Layout.getSymbolAddress(A_Base); + Value -= Layout.getSymbolAddress(&B_SD) - Layout.getSymbolAddress(B_Base); Index = A_Base->getIndex(); IsExtern = 1; @@ -521,7 +538,7 @@ class MachObjectWriterImpl { (Log2Size << 25) | (IsExtern << 27) | (Type << 28)); - Relocations[Fragment.getParent()].push_back(MRE); + Relocations[Fragment->getParent()].push_back(MRE); Index = B_Base->getIndex(); IsExtern = 1; @@ -529,7 +546,7 @@ class MachObjectWriterImpl { } else { const MCSymbol *Symbol = &Target.getSymA()->getSymbol(); MCSymbolData &SD = Asm.getSymbolData(*Symbol); - const MCSymbolData *Base = Asm.getAtom(&SD); + const MCSymbolData *Base = Asm.getAtom(Layout, &SD); // x86_64 almost always uses external relocations, except when there is no // symbol to use as a base address (a local symbol with no preceeding @@ -540,19 +557,12 @@ class MachObjectWriterImpl { // Add the local offset, if needed. if (Base != &SD) - Value += SD.getAddress() - Base->getAddress(); + Value += Layout.getSymbolAddress(&SD) - Layout.getSymbolAddress(Base); } else { - // The index is the section ordinal. - // - // FIXME: O(N) - Index = 1; - MCAssembler::const_iterator it = Asm.begin(), ie = Asm.end(); - for (; it != ie; ++it, ++Index) - if (&*it == SD.getFragment()->getParent()) - break; - assert(it != ie && "Unable to find section index!"); + // The index is the section ordinal (1-based). 
+ Index = SD.getFragment()->getParent()->getOrdinal() + 1; IsExtern = 0; - Value += SD.getAddress(); + Value += Layout.getSymbolAddress(&SD); if (IsPCRel) Value -= Address + (1 << Log2Size); @@ -602,9 +612,16 @@ class MachObjectWriterImpl { } } } else { - if (Modifier == MCSymbolRefExpr::VK_GOT) + if (Modifier == MCSymbolRefExpr::VK_GOT) { Type = RIT_X86_64_GOT; - else if (Modifier != MCSymbolRefExpr::VK_None) + } else if (Modifier == MCSymbolRefExpr::VK_GOTPCREL) { + // GOTPCREL is allowed as a modifier on non-PCrel instructions, in + // which case all we do is set the PCrel bit in the relocation entry; + // this is used with exception handling, for example. The source is + // required to include any necessary offset directly. + Type = RIT_X86_64_GOT; + IsPCRel = 1; + } else if (Modifier != MCSymbolRefExpr::VK_None) llvm_report_error("unsupported symbol modifier in relocation"); else Type = RIT_X86_64_Unsigned; @@ -622,14 +639,15 @@ class MachObjectWriterImpl { (Log2Size << 25) | (IsExtern << 27) | (Type << 28)); - Relocations[Fragment.getParent()].push_back(MRE); + Relocations[Fragment->getParent()].push_back(MRE); } void RecordScatteredRelocation(const MCAssembler &Asm, - const MCFragment &Fragment, + const MCAsmLayout &Layout, + const MCFragment *Fragment, const MCAsmFixup &Fixup, MCValue Target, uint64_t &FixedValue) { - uint32_t Address = Fragment.getOffset() + Fixup.Offset; + uint32_t Address = Layout.getFragmentOffset(Fragment) + Fixup.Offset; unsigned IsPCRel = isFixupKindPCRel(Fixup.Kind); unsigned Log2Size = getFixupKindLog2Size(Fixup.Kind); unsigned Type = RIT_Vanilla; @@ -642,7 +660,7 @@ class MachObjectWriterImpl { llvm_report_error("symbol '" + A->getName() + "' can not be undefined in a subtraction expression"); - uint32_t Value = A_SD->getAddress(); + uint32_t Value = Layout.getSymbolAddress(A_SD); uint32_t Value2 = 0; if (const MCSymbolRefExpr *B = Target.getSymB()) { @@ -658,7 +676,7 @@ class MachObjectWriterImpl { // relocation types from the 
linkers point of view, this is done solely // for pedantic compatibility with 'as'. Type = A_SD->isExternal() ? RIT_Difference : RIT_LocalDifference; - Value2 = B_SD->getAddress(); + Value2 = Layout.getSymbolAddress(B_SD); } // Relocations are written out in reverse order, so the PAIR comes first. @@ -670,7 +688,7 @@ class MachObjectWriterImpl { (IsPCRel << 30) | RF_Scattered); MRE.Word1 = Value2; - Relocations[Fragment.getParent()].push_back(MRE); + Relocations[Fragment->getParent()].push_back(MRE); } MachRelocationEntry MRE; @@ -680,14 +698,14 @@ class MachObjectWriterImpl { (IsPCRel << 30) | RF_Scattered); MRE.Word1 = Value; - Relocations[Fragment.getParent()].push_back(MRE); + Relocations[Fragment->getParent()].push_back(MRE); } - void RecordRelocation(const MCAssembler &Asm, const MCDataFragment &Fragment, - const MCAsmFixup &Fixup, MCValue Target, - uint64_t &FixedValue) { + void RecordRelocation(const MCAssembler &Asm, const MCAsmLayout &Layout, + const MCFragment *Fragment, const MCAsmFixup &Fixup, + MCValue Target, uint64_t &FixedValue) { if (Is64Bit) { - RecordX86_64Relocation(Asm, Fragment, Fixup, Target, FixedValue); + RecordX86_64Relocation(Asm, Layout, Fragment, Fixup, Target, FixedValue); return; } @@ -702,12 +720,12 @@ class MachObjectWriterImpl { if (Target.getSymB() || (Target.getSymA() && !Target.getSymA()->getSymbol().isUndefined() && Offset)) { - RecordScatteredRelocation(Asm, Fragment, Fixup, Target, FixedValue); + RecordScatteredRelocation(Asm, Layout, Fragment, Fixup,Target,FixedValue); return; } // See . - uint32_t Address = Fragment.getOffset() + Fixup.Offset; + uint32_t Address = Layout.getFragmentOffset(Fragment) + Fixup.Offset; uint32_t Value = 0; unsigned Index = 0; unsigned IsExtern = 0; @@ -729,16 +747,9 @@ class MachObjectWriterImpl { Index = SD->getIndex(); Value = 0; } else { - // The index is the section ordinal. 
- // - // FIXME: O(N) - Index = 1; - MCAssembler::const_iterator it = Asm.begin(), ie = Asm.end(); - for (; it != ie; ++it, ++Index) - if (&*it == SD->getFragment()->getParent()) - break; - assert(it != ie && "Unable to find section index!"); - Value = SD->getAddress(); + // The index is the section ordinal (1-based). + Index = SD->getFragment()->getParent()->getOrdinal() + 1; + Value = Layout.getSymbolAddress(SD); } Type = RIT_Vanilla; @@ -752,7 +763,7 @@ class MachObjectWriterImpl { (Log2Size << 25) | (IsExtern << 27) | (Type << 28)); - Relocations[Fragment.getParent()].push_back(MRE); + Relocations[Fragment->getParent()].push_back(MRE); } void BindIndirectSymbols(MCAssembler &Asm) { @@ -920,7 +931,7 @@ class MachObjectWriterImpl { UndefinedSymbolData); } - void WriteObject(const MCAssembler &Asm) { + void WriteObject(const MCAssembler &Asm, const MCAsmLayout &Layout) { unsigned NumSections = Asm.size(); // The section data starts after the header, the segment load command (and @@ -948,16 +959,17 @@ class MachObjectWriterImpl { for (MCAssembler::const_iterator it = Asm.begin(), ie = Asm.end(); it != ie; ++it) { const MCSectionData &SD = *it; + uint64_t Address = Layout.getSectionAddress(&SD); + uint64_t Size = Layout.getSectionSize(&SD); + uint64_t FileSize = Layout.getSectionFileSize(&SD); - VMSize = std::max(VMSize, SD.getAddress() + SD.getSize()); + VMSize = std::max(VMSize, Address + Size); if (Asm.getBackend().isVirtualSection(SD.getSection())) continue; - SectionDataSize = std::max(SectionDataSize, - SD.getAddress() + SD.getSize()); - SectionDataFileSize = std::max(SectionDataFileSize, - SD.getAddress() + SD.getFileSize()); + SectionDataSize = std::max(SectionDataSize, Address + Size); + SectionDataFileSize = std::max(SectionDataFileSize, Address + FileSize); } // The section data is padded to 4 bytes. 
@@ -978,8 +990,8 @@ class MachObjectWriterImpl { ie = Asm.end(); it != ie; ++it) { std::vector &Relocs = Relocations[it]; unsigned NumRelocs = Relocs.size(); - uint64_t SectionStart = SectionDataStart + it->getAddress(); - WriteSection(Asm, *it, SectionStart, RelocTableEnd, NumRelocs); + uint64_t SectionStart = SectionDataStart + Layout.getSectionAddress(it); + WriteSection(Asm, Layout, *it, SectionStart, RelocTableEnd, NumRelocs); RelocTableEnd += NumRelocs * RelocationInfoSize; } @@ -1020,7 +1032,7 @@ class MachObjectWriterImpl { // Write the actual section data. for (MCAssembler::const_iterator it = Asm.begin(), ie = Asm.end(); it != ie; ++it) - Asm.WriteSectionData(it, Writer); + Asm.WriteSectionData(it, Layout, Writer); // Write the extra padding. WriteZeros(SectionDataPadding); @@ -1066,11 +1078,11 @@ class MachObjectWriterImpl { // Write the symbol table entries. for (unsigned i = 0, e = LocalSymbolData.size(); i != e; ++i) - WriteNlist(LocalSymbolData[i]); + WriteNlist(LocalSymbolData[i], Layout); for (unsigned i = 0, e = ExternalSymbolData.size(); i != e; ++i) - WriteNlist(ExternalSymbolData[i]); + WriteNlist(ExternalSymbolData[i], Layout); for (unsigned i = 0, e = UndefinedSymbolData.size(); i != e; ++i) - WriteNlist(UndefinedSymbolData[i]); + WriteNlist(UndefinedSymbolData[i], Layout); // Write the string table. 
OS << StringTable.str(); @@ -1097,13 +1109,15 @@ void MachObjectWriter::ExecutePostLayoutBinding(MCAssembler &Asm) { } void MachObjectWriter::RecordRelocation(const MCAssembler &Asm, - const MCDataFragment &Fragment, + const MCAsmLayout &Layout, + const MCFragment *Fragment, const MCAsmFixup &Fixup, MCValue Target, uint64_t &FixedValue) { - ((MachObjectWriterImpl*) Impl)->RecordRelocation(Asm, Fragment, Fixup, + ((MachObjectWriterImpl*) Impl)->RecordRelocation(Asm, Layout, Fragment, Fixup, Target, FixedValue); } -void MachObjectWriter::WriteObject(const MCAssembler &Asm) { - ((MachObjectWriterImpl*) Impl)->WriteObject(Asm); +void MachObjectWriter::WriteObject(const MCAssembler &Asm, + const MCAsmLayout &Layout) { + ((MachObjectWriterImpl*) Impl)->WriteObject(Asm, Layout); } diff --git a/lib/Support/APFloat.cpp b/lib/Support/APFloat.cpp index 8f860a646721..485bf4d6c9d2 100644 --- a/lib/Support/APFloat.cpp +++ b/lib/Support/APFloat.cpp @@ -65,7 +65,7 @@ namespace llvm { pow(5, power) is power * 815 / (351 * integerPartWidth) + 1 - + However, whilst the result may require only this many parts, because we are multiplying two values to get it, the multiplication may require an extra part with the excess part @@ -100,15 +100,15 @@ hexDigitValue(unsigned int c) unsigned int r; r = c - '0'; - if(r <= 9) + if (r <= 9) return r; r = c - 'A'; - if(r <= 5) + if (r <= 5) return r + 10; r = c - 'a'; - if(r <= 5) + if (r <= 5) return r + 10; return -1U; @@ -116,8 +116,8 @@ hexDigitValue(unsigned int c) static inline void assertArithmeticOK(const llvm::fltSemantics &semantics) { - assert(semantics.arithmeticOK - && "Compile-time arithmetic does not support these semantics"); + assert(semantics.arithmeticOK && + "Compile-time arithmetic does not support these semantics"); } /* Return the value of a decimal exponent of the form @@ -179,37 +179,37 @@ totalExponent(StringRef::iterator p, StringRef::iterator end, assert(p != end && "Exponent has no digits"); negative = *p == '-'; - 
if(*p == '-' || *p == '+') { + if (*p == '-' || *p == '+') { p++; assert(p != end && "Exponent has no digits"); } unsignedExponent = 0; overflow = false; - for(; p != end; ++p) { + for (; p != end; ++p) { unsigned int value; value = decDigitValue(*p); assert(value < 10U && "Invalid character in exponent"); unsignedExponent = unsignedExponent * 10 + value; - if(unsignedExponent > 65535) + if (unsignedExponent > 65535) overflow = true; } - if(exponentAdjustment > 65535 || exponentAdjustment < -65536) + if (exponentAdjustment > 65535 || exponentAdjustment < -65536) overflow = true; - if(!overflow) { + if (!overflow) { exponent = unsignedExponent; - if(negative) + if (negative) exponent = -exponent; exponent += exponentAdjustment; - if(exponent > 65535 || exponent < -65536) + if (exponent > 65535 || exponent < -65536) overflow = true; } - if(overflow) + if (overflow) exponent = negative ? -65536: 65535; return exponent; @@ -221,15 +221,15 @@ skipLeadingZeroesAndAnyDot(StringRef::iterator begin, StringRef::iterator end, { StringRef::iterator p = begin; *dot = end; - while(*p == '0' && p != end) + while (*p == '0' && p != end) p++; - if(*p == '.') { + if (*p == '.') { *dot = p++; assert(end - begin != 1 && "Significand has no digits"); - while(*p == '0' && p != end) + while (*p == '0' && p != end) p++; } @@ -323,13 +323,13 @@ trailingHexadecimalFraction(StringRef::iterator p, StringRef::iterator end, /* If the first trailing digit isn't 0 or 8 we can work out the fraction immediately. */ - if(digitValue > 8) + if (digitValue > 8) return lfMoreThanHalf; - else if(digitValue < 8 && digitValue > 0) + else if (digitValue < 8 && digitValue > 0) return lfLessThanHalf; /* Otherwise we need to find the first non-zero digit. 
*/ - while(*p == '0') + while (*p == '0') p++; assert(p != end && "Invalid trailing hexadecimal fraction!"); @@ -338,7 +338,7 @@ trailingHexadecimalFraction(StringRef::iterator p, StringRef::iterator end, /* If we ran off the end it is exactly zero or one-half, otherwise a little more. */ - if(hexDigit == -1U) + if (hexDigit == -1U) return digitValue == 0 ? lfExactlyZero: lfExactlyHalf; else return digitValue == 0 ? lfLessThanHalf: lfMoreThanHalf; @@ -356,12 +356,12 @@ lostFractionThroughTruncation(const integerPart *parts, lsb = APInt::tcLSB(parts, partCount); /* Note this is guaranteed true if bits == 0, or LSB == -1U. */ - if(bits <= lsb) + if (bits <= lsb) return lfExactlyZero; - if(bits == lsb + 1) + if (bits == lsb + 1) return lfExactlyHalf; - if(bits <= partCount * integerPartWidth - && APInt::tcExtractBit(parts, bits - 1)) + if (bits <= partCount * integerPartWidth && + APInt::tcExtractBit(parts, bits - 1)) return lfMoreThanHalf; return lfLessThanHalf; @@ -385,10 +385,10 @@ static lostFraction combineLostFractions(lostFraction moreSignificant, lostFraction lessSignificant) { - if(lessSignificant != lfExactlyZero) { - if(moreSignificant == lfExactlyZero) + if (lessSignificant != lfExactlyZero) { + if (moreSignificant == lfExactlyZero) moreSignificant = lfLessThanHalf; - else if(moreSignificant == lfExactlyHalf) + else if (moreSignificant == lfExactlyHalf) moreSignificant = lfMoreThanHalf; } @@ -468,7 +468,7 @@ powerOf5(integerPart *dst, unsigned int power) 15625, 78125 }; integerPart pow5s[maxPowerOfFiveParts * 2 + 5]; pow5s[0] = 78125 * 5; - + unsigned int partsCount[16] = { 1 }; integerPart scratch[maxPowerOfFiveParts], *p1, *p2, *pow5; unsigned int result; @@ -588,14 +588,14 @@ APFloat::initialize(const fltSemantics *ourSemantics) semantics = ourSemantics; count = partCount(); - if(count > 1) + if (count > 1) significand.parts = new integerPart[count]; } void APFloat::freeSignificand() { - if(partCount() > 1) + if (partCount() > 1) delete [] 
significand.parts; } @@ -609,7 +609,7 @@ APFloat::assign(const APFloat &rhs) exponent = rhs.exponent; sign2 = rhs.sign2; exponent2 = rhs.exponent2; - if(category == fcNormal || category == fcNaN) + if (category == fcNormal || category == fcNaN) copySignificand(rhs); } @@ -683,8 +683,8 @@ APFloat APFloat::makeNaN(const fltSemantics &Sem, bool SNaN, bool Negative, APFloat & APFloat::operator=(const APFloat &rhs) { - if(this != &rhs) { - if(semantics != rhs.semantics) { + if (this != &rhs) { + if (semantics != rhs.semantics) { freeSignificand(); initialize(rhs.semantics); } @@ -881,7 +881,7 @@ APFloat::multiplySignificand(const APFloat &rhs, const APFloat *addend) precision = semantics->precision; newPartsCount = partCountForBits(precision * 2); - if(newPartsCount > 4) + if (newPartsCount > 4) fullSignificand = new integerPart[newPartsCount]; else fullSignificand = scratch; @@ -896,7 +896,7 @@ APFloat::multiplySignificand(const APFloat &rhs, const APFloat *addend) omsb = APInt::tcMSB(fullSignificand, newPartsCount) + 1; exponent += rhs.exponent; - if(addend) { + if (addend) { Significand savedSignificand = significand; const fltSemantics *savedSemantics = semantics; fltSemantics extendedSemantics; @@ -905,18 +905,17 @@ APFloat::multiplySignificand(const APFloat &rhs, const APFloat *addend) /* Normalize our MSB. */ extendedPrecision = precision + precision - 1; - if(omsb != extendedPrecision) - { - APInt::tcShiftLeft(fullSignificand, newPartsCount, - extendedPrecision - omsb); - exponent -= extendedPrecision - omsb; - } + if (omsb != extendedPrecision) { + APInt::tcShiftLeft(fullSignificand, newPartsCount, + extendedPrecision - omsb); + exponent -= extendedPrecision - omsb; + } /* Create new semantics. 
*/ extendedSemantics = *semantics; extendedSemantics.precision = extendedPrecision; - if(newPartsCount == 1) + if (newPartsCount == 1) significand.part = fullSignificand[0]; else significand.parts = fullSignificand; @@ -928,7 +927,7 @@ APFloat::multiplySignificand(const APFloat &rhs, const APFloat *addend) lost_fraction = addOrSubtractSignificand(extendedAddend, false); /* Restore our state. */ - if(newPartsCount == 1) + if (newPartsCount == 1) fullSignificand[0] = significand.part; significand = savedSignificand; semantics = savedSemantics; @@ -938,7 +937,7 @@ APFloat::multiplySignificand(const APFloat &rhs, const APFloat *addend) exponent -= (precision - 1); - if(omsb > precision) { + if (omsb > precision) { unsigned int bits, significantParts; lostFraction lf; @@ -951,7 +950,7 @@ APFloat::multiplySignificand(const APFloat &rhs, const APFloat *addend) APInt::tcAssign(lhsSignificand, fullSignificand, partsCount); - if(newPartsCount > 4) + if (newPartsCount > 4) delete [] fullSignificand; return lost_fraction; @@ -973,7 +972,7 @@ APFloat::divideSignificand(const APFloat &rhs) rhsSignificand = rhs.significandParts(); partsCount = partCount(); - if(partsCount > 2) + if (partsCount > 2) dividend = new integerPart[partsCount * 2]; else dividend = scratch; @@ -981,7 +980,7 @@ APFloat::divideSignificand(const APFloat &rhs) divisor = dividend + partsCount; /* Copy the dividend and divisor as they will be modified in-place. */ - for(i = 0; i < partsCount; i++) { + for (i = 0; i < partsCount; i++) { dividend[i] = lhsSignificand[i]; divisor[i] = rhsSignificand[i]; lhsSignificand[i] = 0; @@ -993,14 +992,14 @@ APFloat::divideSignificand(const APFloat &rhs) /* Normalize the divisor. */ bit = precision - APInt::tcMSB(divisor, partsCount) - 1; - if(bit) { + if (bit) { exponent += bit; APInt::tcShiftLeft(divisor, partsCount, bit); } /* Normalize the dividend. 
*/ bit = precision - APInt::tcMSB(dividend, partsCount) - 1; - if(bit) { + if (bit) { exponent -= bit; APInt::tcShiftLeft(dividend, partsCount, bit); } @@ -1008,15 +1007,15 @@ APFloat::divideSignificand(const APFloat &rhs) /* Ensure the dividend >= divisor initially for the loop below. Incidentally, this means that the division loop below is guaranteed to set the integer bit to one. */ - if(APInt::tcCompare(dividend, divisor, partsCount) < 0) { + if (APInt::tcCompare(dividend, divisor, partsCount) < 0) { exponent--; APInt::tcShiftLeft(dividend, partsCount, 1); assert(APInt::tcCompare(dividend, divisor, partsCount) >= 0); } /* Long division. */ - for(bit = precision; bit; bit -= 1) { - if(APInt::tcCompare(dividend, divisor, partsCount) >= 0) { + for (bit = precision; bit; bit -= 1) { + if (APInt::tcCompare(dividend, divisor, partsCount) >= 0) { APInt::tcSubtract(dividend, divisor, 0, partsCount); APInt::tcSetBit(lhsSignificand, bit - 1); } @@ -1027,16 +1026,16 @@ APFloat::divideSignificand(const APFloat &rhs) /* Figure out the lost fraction. */ int cmp = APInt::tcCompare(dividend, divisor, partsCount); - if(cmp > 0) + if (cmp > 0) lost_fraction = lfMoreThanHalf; - else if(cmp == 0) + else if (cmp == 0) lost_fraction = lfExactlyHalf; - else if(APInt::tcIsZero(dividend, partsCount)) + else if (APInt::tcIsZero(dividend, partsCount)) lost_fraction = lfExactlyZero; else lost_fraction = lfLessThanHalf; - if(partsCount > 2) + if (partsCount > 2) delete [] dividend; return lost_fraction; @@ -1072,7 +1071,7 @@ APFloat::shiftSignificandLeft(unsigned int bits) { assert(bits < semantics->precision); - if(bits) { + if (bits) { unsigned int partsCount = partCount(); APInt::tcShiftLeft(significandParts(), partsCount, bits); @@ -1095,13 +1094,13 @@ APFloat::compareAbsoluteValue(const APFloat &rhs) const /* If exponents are equal, do an unsigned bignum comparison of the significands. 
*/ - if(compare == 0) + if (compare == 0) compare = APInt::tcCompare(significandParts(), rhs.significandParts(), partCount()); - if(compare > 0) + if (compare > 0) return cmpGreaterThan; - else if(compare < 0) + else if (compare < 0) return cmpLessThan; else return cmpEqual; @@ -1113,14 +1112,13 @@ APFloat::opStatus APFloat::handleOverflow(roundingMode rounding_mode) { /* Infinity? */ - if(rounding_mode == rmNearestTiesToEven - || rounding_mode == rmNearestTiesToAway - || (rounding_mode == rmTowardPositive && !sign) - || (rounding_mode == rmTowardNegative && sign)) - { - category = fcInfinity; - return (opStatus) (opOverflow | opInexact); - } + if (rounding_mode == rmNearestTiesToEven || + rounding_mode == rmNearestTiesToAway || + (rounding_mode == rmTowardPositive && !sign) || + (rounding_mode == rmTowardNegative && sign)) { + category = fcInfinity; + return (opStatus) (opOverflow | opInexact); + } /* Otherwise we become the largest finite number. */ category = fcNormal; @@ -1155,11 +1153,11 @@ APFloat::roundAwayFromZero(roundingMode rounding_mode, return lost_fraction == lfExactlyHalf || lost_fraction == lfMoreThanHalf; case rmNearestTiesToEven: - if(lost_fraction == lfMoreThanHalf) + if (lost_fraction == lfMoreThanHalf) return true; /* Our zeroes don't have a significand to test. */ - if(lost_fraction == lfExactlyHalf && category != fcZero) + if (lost_fraction == lfExactlyHalf && category != fcZero) return APInt::tcExtractBit(significandParts(), bit); return false; @@ -1182,13 +1180,13 @@ APFloat::normalize(roundingMode rounding_mode, unsigned int omsb; /* One, not zero, based MSB. */ int exponentChange; - if(category != fcNormal) + if (category != fcNormal) return opOK; /* Before rounding normalize the exponent of fcNormal numbers. */ omsb = significandMSB() + 1; - if(omsb) { + if (omsb) { /* OMSB is numbered from 1. We want to place it in the integer bit numbered PRECISON if possible, with a compensating change in the exponent. 
*/ @@ -1196,16 +1194,16 @@ APFloat::normalize(roundingMode rounding_mode, /* If the resulting exponent is too high, overflow according to the rounding mode. */ - if(exponent + exponentChange > semantics->maxExponent) + if (exponent + exponentChange > semantics->maxExponent) return handleOverflow(rounding_mode); /* Subnormal numbers have exponent minExponent, and their MSB is forced based on that. */ - if(exponent + exponentChange < semantics->minExponent) + if (exponent + exponentChange < semantics->minExponent) exponentChange = semantics->minExponent - exponent; /* Shifting left is easy as we don't lose precision. */ - if(exponentChange < 0) { + if (exponentChange < 0) { assert(lost_fraction == lfExactlyZero); shiftSignificandLeft(-exponentChange); @@ -1213,7 +1211,7 @@ APFloat::normalize(roundingMode rounding_mode, return opOK; } - if(exponentChange > 0) { + if (exponentChange > 0) { lostFraction lf; /* Shift right and capture any new lost fraction. */ @@ -1222,7 +1220,7 @@ APFloat::normalize(roundingMode rounding_mode, lost_fraction = combineLostFractions(lf, lost_fraction); /* Keep OMSB up-to-date. */ - if(omsb > (unsigned) exponentChange) + if (omsb > (unsigned) exponentChange) omsb -= exponentChange; else omsb = 0; @@ -1234,28 +1232,28 @@ APFloat::normalize(roundingMode rounding_mode, /* As specified in IEEE 754, since we do not trap we do not report underflow for exact results. */ - if(lost_fraction == lfExactlyZero) { + if (lost_fraction == lfExactlyZero) { /* Canonicalize zeroes. */ - if(omsb == 0) + if (omsb == 0) category = fcZero; return opOK; } /* Increment the significand if we're rounding away from zero. */ - if(roundAwayFromZero(rounding_mode, lost_fraction, 0)) { - if(omsb == 0) + if (roundAwayFromZero(rounding_mode, lost_fraction, 0)) { + if (omsb == 0) exponent = semantics->minExponent; incrementSignificand(); omsb = significandMSB() + 1; /* Did the significand increment overflow? 
*/ - if(omsb == (unsigned) semantics->precision + 1) { + if (omsb == (unsigned) semantics->precision + 1) { /* Renormalize by incrementing the exponent and shifting our significand right one. However if we already have the maximum exponent we overflow to infinity. */ - if(exponent == semantics->maxExponent) { + if (exponent == semantics->maxExponent) { category = fcInfinity; return (opStatus) (opOverflow | opInexact); @@ -1269,14 +1267,14 @@ APFloat::normalize(roundingMode rounding_mode, /* The normal case - we were and are not denormal, and any significand increment above didn't overflow. */ - if(omsb == semantics->precision) + if (omsb == semantics->precision) return opInexact; /* We have a non-zero denormal. */ assert(omsb < semantics->precision); /* Canonicalize zeroes. */ - if(omsb == 0) + if (omsb == 0) category = fcZero; /* The fcZero case is a denormal that underflowed to zero. */ @@ -1324,7 +1322,7 @@ APFloat::addOrSubtractSpecials(const APFloat &rhs, bool subtract) case convolve(fcInfinity, fcInfinity): /* Differently signed infinities can only be validly subtracted. */ - if(((sign ^ rhs.sign)!=0) != subtract) { + if (((sign ^ rhs.sign)!=0) != subtract) { makeNaN(); return opInvalidOp; } @@ -1352,7 +1350,7 @@ APFloat::addOrSubtractSignificand(const APFloat &rhs, bool subtract) bits = exponent - rhs.exponent; /* Subtraction is more subtle than one might naively expect. */ - if(subtract) { + if (subtract) { APFloat temp_rhs(rhs); bool reverse; @@ -1381,16 +1379,16 @@ APFloat::addOrSubtractSignificand(const APFloat &rhs, bool subtract) /* Invert the lost fraction - it was on the RHS and subtracted. */ - if(lost_fraction == lfLessThanHalf) + if (lost_fraction == lfLessThanHalf) lost_fraction = lfMoreThanHalf; - else if(lost_fraction == lfMoreThanHalf) + else if (lost_fraction == lfMoreThanHalf) lost_fraction = lfLessThanHalf; /* The code above is intended to ensure that no borrow is necessary. 
*/ assert(!carry); } else { - if(bits > 0) { + if (bits > 0) { APFloat temp_rhs(rhs); lost_fraction = temp_rhs.shiftSignificandRight(bits); @@ -1561,7 +1559,7 @@ APFloat::addOrSubtract(const APFloat &rhs, roundingMode rounding_mode, fs = addOrSubtractSpecials(rhs, subtract); /* This return code means it was not a simple case. */ - if(fs == opDivByZero) { + if (fs == opDivByZero) { lostFraction lost_fraction; lost_fraction = addOrSubtractSignificand(rhs, subtract); @@ -1574,8 +1572,8 @@ APFloat::addOrSubtract(const APFloat &rhs, roundingMode rounding_mode, /* If two numbers add (exactly) to zero, IEEE 754 decrees it is a positive zero unless rounding to minus infinity, except that adding two like-signed zeroes gives that zero. */ - if(category == fcZero) { - if(rhs.category != fcZero || (sign == rhs.sign) == subtract) + if (category == fcZero) { + if (rhs.category != fcZero || (sign == rhs.sign) == subtract) sign = (rounding_mode == rmTowardNegative); } @@ -1606,10 +1604,10 @@ APFloat::multiply(const APFloat &rhs, roundingMode rounding_mode) sign ^= rhs.sign; fs = multiplySpecials(rhs); - if(category == fcNormal) { + if (category == fcNormal) { lostFraction lost_fraction = multiplySignificand(rhs, 0); fs = normalize(rounding_mode, lost_fraction); - if(lost_fraction != lfExactlyZero) + if (lost_fraction != lfExactlyZero) fs = (opStatus) (fs | opInexact); } @@ -1626,10 +1624,10 @@ APFloat::divide(const APFloat &rhs, roundingMode rounding_mode) sign ^= rhs.sign; fs = divideSpecials(rhs); - if(category == fcNormal) { + if (category == fcNormal) { lostFraction lost_fraction = divideSignificand(rhs); fs = normalize(rounding_mode, lost_fraction); - if(lost_fraction != lfExactlyZero) + if (lost_fraction != lfExactlyZero) fs = (opStatus) (fs | opInexact); } @@ -1673,7 +1671,7 @@ APFloat::remainder(const APFloat &rhs) return fs; } -/* Normalized llvm frem (C fmod). +/* Normalized llvm frem (C fmod). This is not currently correct in all cases. 
*/ APFloat::opStatus APFloat::mod(const APFloat &rhs, roundingMode rounding_mode) @@ -1730,20 +1728,20 @@ APFloat::fusedMultiplyAdd(const APFloat &multiplicand, /* If and only if all arguments are normal do we need to do an extended-precision calculation. */ - if(category == fcNormal - && multiplicand.category == fcNormal - && addend.category == fcNormal) { + if (category == fcNormal && + multiplicand.category == fcNormal && + addend.category == fcNormal) { lostFraction lost_fraction; lost_fraction = multiplySignificand(multiplicand, &addend); fs = normalize(rounding_mode, lost_fraction); - if(lost_fraction != lfExactlyZero) + if (lost_fraction != lfExactlyZero) fs = (opStatus) (fs | opInexact); /* If two numbers add (exactly) to zero, IEEE 754 decrees it is a positive zero unless rounding to minus infinity, except that adding two like-signed zeroes gives that zero. */ - if(category == fcZero && sign != addend.sign) + if (category == fcZero && sign != addend.sign) sign = (rounding_mode == rmTowardNegative); } else { fs = multiplySpecials(multiplicand); @@ -1755,7 +1753,7 @@ APFloat::fusedMultiplyAdd(const APFloat &multiplicand, If we need to do the addition we can do so with normal precision. 
*/ - if(fs == opOK) + if (fs == opOK) fs = addOrSubtract(addend, rounding_mode, false); } @@ -1787,7 +1785,7 @@ APFloat::compare(const APFloat &rhs) const case convolve(fcInfinity, fcNormal): case convolve(fcInfinity, fcZero): case convolve(fcNormal, fcZero): - if(sign) + if (sign) return cmpLessThan; else return cmpGreaterThan; @@ -1795,15 +1793,15 @@ APFloat::compare(const APFloat &rhs) const case convolve(fcNormal, fcInfinity): case convolve(fcZero, fcInfinity): case convolve(fcZero, fcNormal): - if(rhs.sign) + if (rhs.sign) return cmpGreaterThan; else return cmpLessThan; case convolve(fcInfinity, fcInfinity): - if(sign == rhs.sign) + if (sign == rhs.sign) return cmpEqual; - else if(sign) + else if (sign) return cmpLessThan; else return cmpGreaterThan; @@ -1816,8 +1814,8 @@ APFloat::compare(const APFloat &rhs) const } /* Two normal numbers. Do they have the same sign? */ - if(sign != rhs.sign) { - if(sign) + if (sign != rhs.sign) { + if (sign) result = cmpLessThan; else result = cmpGreaterThan; @@ -1825,10 +1823,10 @@ APFloat::compare(const APFloat &rhs) const /* Compare absolute values; invert result if negative. */ result = compareAbsoluteValue(rhs); - if(sign) { - if(result == cmpLessThan) + if (sign) { + if (result == cmpLessThan) result = cmpGreaterThan; - else if(result == cmpGreaterThan) + else if (result == cmpGreaterThan) result = cmpLessThan; } } @@ -1886,7 +1884,7 @@ APFloat::convert(const fltSemantics &toSemantics, } } - if(category == fcNormal) { + if (category == fcNormal) { /* Re-interpret our bit-pattern. */ exponent += toSemantics.precision - semantics->precision; semantics = &toSemantics; @@ -1911,7 +1909,7 @@ APFloat::convert(const fltSemantics &toSemantics, // x87 long double). 
if (APInt::tcLSB(significandParts(), newPartCount) < ushift) *losesInfo = true; - if (oldSemantics == &APFloat::x87DoubleExtended && + if (oldSemantics == &APFloat::x87DoubleExtended && (!(*significandParts() & 0x8000000000000000ULL) || !(*significandParts() & 0x4000000000000000ULL))) *losesInfo = true; @@ -1956,12 +1954,12 @@ APFloat::convertToSignExtendedInteger(integerPart *parts, unsigned int width, *isExact = false; /* Handle the three special cases first. */ - if(category == fcInfinity || category == fcNaN) + if (category == fcInfinity || category == fcNaN) return opInvalidOp; dstPartsCount = partCountForBits(width); - if(category == fcZero) { + if (category == fcZero) { APInt::tcSet(parts, 0, dstPartsCount); // Negative zero can't be represented as an int. *isExact = !sign; @@ -2004,8 +2002,8 @@ APFloat::convertToSignExtendedInteger(integerPart *parts, unsigned int width, if (truncatedBits) { lost_fraction = lostFractionThroughTruncation(src, partCount(), truncatedBits); - if (lost_fraction != lfExactlyZero - && roundAwayFromZero(rounding_mode, lost_fraction, truncatedBits)) { + if (lost_fraction != lfExactlyZero && + roundAwayFromZero(rounding_mode, lost_fraction, truncatedBits)) { if (APInt::tcIncrement(parts, dstPartsCount)) return opInvalidOp; /* Overflow. */ } @@ -2062,7 +2060,7 @@ APFloat::convertToInteger(integerPart *parts, unsigned int width, { opStatus fs; - fs = convertToSignExtendedInteger(parts, width, isSigned, rounding_mode, + fs = convertToSignExtendedInteger(parts, width, isSigned, rounding_mode, isExact); if (fs == opInvalidOp) { @@ -2149,8 +2147,8 @@ APFloat::convertFromSignExtendedInteger(const integerPart *src, opStatus status; assertArithmeticOK(*semantics); - if (isSigned - && APInt::tcExtractBit(src, srcCount * integerPartWidth - 1)) { + if (isSigned && + APInt::tcExtractBit(src, srcCount * integerPartWidth - 1)) { integerPart *copy; /* If we're signed and negative negate a copy. 
*/ @@ -2178,7 +2176,7 @@ APFloat::convertFromZeroExtendedInteger(const integerPart *parts, APInt api = APInt(width, partCount, parts); sign = false; - if(isSigned && APInt::tcExtractBit(parts, width - 1)) { + if (isSigned && APInt::tcExtractBit(parts, width - 1)) { sign = true; api = -api; } @@ -2209,10 +2207,10 @@ APFloat::convertFromHexadecimalString(const StringRef &s, StringRef::iterator p = skipLeadingZeroesAndAnyDot(begin, end, &dot); firstSignificantDigit = p; - for(; p != end;) { + for (; p != end;) { integerPart hex_value; - if(*p == '.') { + if (*p == '.') { assert(dot == end && "String contains multiple dots"); dot = p++; if (p == end) { @@ -2221,7 +2219,7 @@ APFloat::convertFromHexadecimalString(const StringRef &s, } hex_value = hexDigitValue(*p); - if(hex_value == -1U) { + if (hex_value == -1U) { break; } @@ -2231,13 +2229,13 @@ APFloat::convertFromHexadecimalString(const StringRef &s, break; } else { /* Store the number whilst 4-bit nibbles remain. */ - if(bitPos) { + if (bitPos) { bitPos -= 4; hex_value <<= bitPos % integerPartWidth; significand[bitPos / integerPartWidth] |= hex_value; } else { lost_fraction = trailingHexadecimalFraction(p, end, hex_value); - while(p != end && hexDigitValue(*p) != -1U) + while (p != end && hexDigitValue(*p) != -1U) p++; break; } @@ -2251,7 +2249,7 @@ APFloat::convertFromHexadecimalString(const StringRef &s, assert((dot == end || p - begin != 1) && "Significand has no digits"); /* Ignore the exponent if we are zero. */ - if(p != firstSignificantDigit) { + if (p != firstSignificantDigit) { int expAdjustment; /* Implicit hexadecimal point? */ @@ -2261,7 +2259,7 @@ APFloat::convertFromHexadecimalString(const StringRef &s, /* Calculate the exponent adjustment implicit in the number of significant digits. 
*/ expAdjustment = static_cast(dot - firstSignificantDigit); - if(expAdjustment < 0) + if (expAdjustment < 0) expAdjustment++; expAdjustment = expAdjustment * 4 - 1; @@ -2287,8 +2285,8 @@ APFloat::roundSignificandWithExponent(const integerPart *decSigParts, integerPart pow5Parts[maxPowerOfFiveParts]; bool isNearest; - isNearest = (rounding_mode == rmNearestTiesToEven - || rounding_mode == rmNearestTiesToAway); + isNearest = (rounding_mode == rmNearestTiesToEven || + rounding_mode == rmNearestTiesToAway); parts = partCountForBits(semantics->precision + 11); @@ -2482,13 +2480,13 @@ APFloat::convertFromString(const StringRef &str, roundingMode rounding_mode) StringRef::iterator p = str.begin(); size_t slen = str.size(); sign = *p == '-' ? 1 : 0; - if(*p == '-' || *p == '+') { + if (*p == '-' || *p == '+') { p++; slen--; assert(slen && "String has no digits"); } - if(slen >= 2 && p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) { + if (slen >= 2 && p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) { assert(slen - 2 && "Invalid string"); return convertFromHexadecimalString(StringRef(p + 2, slen - 2), rounding_mode); @@ -3013,7 +3011,7 @@ APFloat::initFromPPCDoubleDoubleAPInt(const APInt &api) // exponent2 and significand2 are required to be 0; we don't check category = fcInfinity; } else if (myexponent==0x7ff && mysignificand!=0) { - // exponent meaningless. So is the whole second word, but keep it + // exponent meaningless. So is the whole second word, but keep it // for determinism. 
category = fcNaN; exponent2 = myexponent2; @@ -3031,7 +3029,7 @@ APFloat::initFromPPCDoubleDoubleAPInt(const APInt &api) exponent = -1022; else significandParts()[0] |= 0x10000000000000LL; // integer bit - if (myexponent2==0) + if (myexponent2==0) exponent2 = -1022; else significandParts()[1] |= 0x10000000000000LL; // integer bit @@ -3217,8 +3215,8 @@ APFloat APFloat::getLargest(const fltSemantics &Sem, bool Negative) { significand[i] = ~((integerPart) 0); // ...and then clear the top bits for internal consistency. - significand[N-1] - &= (((integerPart) 1) << ((Sem.precision % integerPartWidth) - 1)) - 1; + significand[N-1] &= + (((integerPart) 1) << ((Sem.precision % integerPartWidth) - 1)) - 1; return Val; } @@ -3247,8 +3245,8 @@ APFloat APFloat::getSmallestNormalized(const fltSemantics &Sem, bool Negative) { Val.exponent = Sem.minExponent; Val.zeroSignificand(); - Val.significandParts()[partCountForBits(Sem.precision)-1] - |= (((integerPart) 1) << ((Sem.precision % integerPartWidth) - 1)); + Val.significandParts()[partCountForBits(Sem.precision)-1] |= + (((integerPart) 1) << ((Sem.precision % integerPartWidth) - 1)); return Val; } @@ -3433,7 +3431,7 @@ void APFloat::toString(SmallVectorImpl &Str, // log2(N * 5^e) == log2(N) + e * log2(5) // <= semantics->precision + e * 137 / 59 // (log_2(5) ~ 2.321928 < 2.322034 ~ 137/59) - + unsigned precision = semantics->precision + 137 * texp / 59; // Multiply significand by 5^e. 
@@ -3442,7 +3440,7 @@ void APFloat::toString(SmallVectorImpl &Str, APInt five_to_the_i(precision, 5); while (true) { if (texp & 1) significand *= five_to_the_i; - + texp >>= 1; if (!texp) break; five_to_the_i *= five_to_the_i; diff --git a/lib/Support/APInt.cpp b/lib/Support/APInt.cpp index 6a6384aa3f43..50025d214bcb 100644 --- a/lib/Support/APInt.cpp +++ b/lib/Support/APInt.cpp @@ -702,15 +702,14 @@ static inline uint32_t hashword(const uint64_t *k64, size_t length) a = b = c = 0xdeadbeef + (((uint32_t)length)<<2); /*------------------------------------------------- handle most of the key */ - while (length > 3) - { - a += k[0]; - b += k[1]; - c += k[2]; - mix(a,b,c); - length -= 3; - k += 3; - } + while (length > 3) { + a += k[0]; + b += k[1]; + c += k[2]; + mix(a,b,c); + length -= 3; + k += 3; + } /*------------------------------------------- handle the last 3 uint32_t's */ switch (length) { /* all the case statements fall through */ @@ -1383,8 +1382,8 @@ APInt APInt::sqrt() const { // libc sqrt function which will probably use a hardware sqrt computation. // This should be faster than the algorithm below. if (magnitude < 52) { -#ifdef _MSC_VER - // Amazingly, VC++ doesn't have round(). +#if defined( _MSC_VER ) || defined(_MINIX) + // Amazingly, VC++ and Minix don't have round(). 
return APInt(BitWidth, uint64_t(::sqrt(double(isSingleWord()?VAL:pVal[0]))) + 0.5); #else @@ -2065,8 +2064,8 @@ void APInt::fromString(unsigned numbits, const StringRef& str, uint8_t radix) { assert((slen <= numbits || radix != 2) && "Insufficient bit width"); assert(((slen-1)*3 <= numbits || radix != 8) && "Insufficient bit width"); assert(((slen-1)*4 <= numbits || radix != 16) && "Insufficient bit width"); - assert((((slen-1)*64)/22 <= numbits || radix != 10) - && "Insufficient bit width"); + assert((((slen-1)*64)/22 <= numbits || radix != 10) && + "Insufficient bit width"); // Allocate memory if (!isSingleWord()) @@ -2229,7 +2228,7 @@ namespace { static inline integerPart lowBitMask(unsigned int bits) { - assert (bits != 0 && bits <= integerPartWidth); + assert(bits != 0 && bits <= integerPartWidth); return ~(integerPart) 0 >> (integerPartWidth - bits); } @@ -2306,10 +2305,10 @@ APInt::tcSet(integerPart *dst, integerPart part, unsigned int parts) { unsigned int i; - assert (parts > 0); + assert(parts > 0); dst[0] = part; - for(i = 1; i < parts; i++) + for (i = 1; i < parts; i++) dst[i] = 0; } @@ -2319,7 +2318,7 @@ APInt::tcAssign(integerPart *dst, const integerPart *src, unsigned int parts) { unsigned int i; - for(i = 0; i < parts; i++) + for (i = 0; i < parts; i++) dst[i] = src[i]; } @@ -2329,7 +2328,7 @@ APInt::tcIsZero(const integerPart *src, unsigned int parts) { unsigned int i; - for(i = 0; i < parts; i++) + for (i = 0; i < parts; i++) if (src[i]) return false; @@ -2340,8 +2339,8 @@ APInt::tcIsZero(const integerPart *src, unsigned int parts) int APInt::tcExtractBit(const integerPart *parts, unsigned int bit) { - return(parts[bit / integerPartWidth] - & ((integerPart) 1 << bit % integerPartWidth)) != 0; + return (parts[bit / integerPartWidth] & + ((integerPart) 1 << bit % integerPartWidth)) != 0; } /* Set the given bit of a bignum. 
*/ @@ -2366,7 +2365,7 @@ APInt::tcLSB(const integerPart *parts, unsigned int n) { unsigned int i, lsb; - for(i = 0; i < n; i++) { + for (i = 0; i < n; i++) { if (parts[i] != 0) { lsb = partLSB(parts[i]); @@ -2385,13 +2384,13 @@ APInt::tcMSB(const integerPart *parts, unsigned int n) unsigned int msb; do { - --n; + --n; - if (parts[n] != 0) { - msb = partMSB(parts[n]); + if (parts[n] != 0) { + msb = partMSB(parts[n]); - return msb + n * integerPartWidth; - } + return msb + n * integerPartWidth; + } } while (n); return -1U; @@ -2408,7 +2407,7 @@ APInt::tcExtract(integerPart *dst, unsigned int dstCount,const integerPart *src, unsigned int firstSrcPart, dstParts, shift, n; dstParts = (srcBits + integerPartWidth - 1) / integerPartWidth; - assert (dstParts <= dstCount); + assert(dstParts <= dstCount); firstSrcPart = srcLSB / integerPartWidth; tcAssign (dst, src + firstSrcPart, dstParts); @@ -2443,7 +2442,7 @@ APInt::tcAdd(integerPart *dst, const integerPart *rhs, assert(c <= 1); - for(i = 0; i < parts; i++) { + for (i = 0; i < parts; i++) { integerPart l; l = dst[i]; @@ -2468,7 +2467,7 @@ APInt::tcSubtract(integerPart *dst, const integerPart *rhs, assert(c <= 1); - for(i = 0; i < parts; i++) { + for (i = 0; i < parts; i++) { integerPart l; l = dst[i]; @@ -2518,7 +2517,7 @@ APInt::tcMultiplyPart(integerPart *dst, const integerPart *src, /* N loops; minimum of dstParts and srcParts. */ n = dstParts < srcParts ? dstParts: srcParts; - for(i = 0; i < n; i++) { + for (i = 0; i < n; i++) { integerPart low, mid, high, srcPart; /* [ LOW, HIGH ] = MULTIPLIER * SRC[i] + DST[i] + CARRY. @@ -2583,7 +2582,7 @@ APInt::tcMultiplyPart(integerPart *dst, const integerPart *src, non-zero. This is true if any remaining src parts are non-zero and the multiplier is non-zero. 
*/ if (multiplier) - for(; i < srcParts; i++) + for (; i < srcParts; i++) if (src[i]) return 1; @@ -2608,7 +2607,7 @@ APInt::tcMultiply(integerPart *dst, const integerPart *lhs, overflow = 0; tcSet(dst, 0, parts); - for(i = 0; i < parts; i++) + for (i = 0; i < parts; i++) overflow |= tcMultiplyPart(&dst[i], lhs, rhs[i], 0, parts, parts - i, true); @@ -2634,7 +2633,7 @@ APInt::tcFullMultiply(integerPart *dst, const integerPart *lhs, tcSet(dst, 0, rhsParts); - for(n = 0; n < lhsParts; n++) + for (n = 0; n < lhsParts; n++) tcMultiplyPart(&dst[n], rhs, lhs[n], 0, rhsParts, rhsParts + 1, true); n = lhsParts + rhsParts; @@ -2678,7 +2677,7 @@ APInt::tcDivide(integerPart *lhs, const integerPart *rhs, /* Loop, subtracting SRHS if REMAINDER is greater and adding that to the total. */ - for(;;) { + for (;;) { int compare; compare = tcCompare(remainder, srhs, parts); @@ -2746,7 +2745,7 @@ APInt::tcShiftRight(integerPart *dst, unsigned int parts, unsigned int count) /* Perform the shift. This leaves the most significant COUNT bits of the result at zero. 
*/ - for(i = 0; i < parts; i++) { + for (i = 0; i < parts; i++) { integerPart part; if (i + jump >= parts) { @@ -2771,7 +2770,7 @@ APInt::tcAnd(integerPart *dst, const integerPart *rhs, unsigned int parts) { unsigned int i; - for(i = 0; i < parts; i++) + for (i = 0; i < parts; i++) dst[i] &= rhs[i]; } @@ -2781,7 +2780,7 @@ APInt::tcOr(integerPart *dst, const integerPart *rhs, unsigned int parts) { unsigned int i; - for(i = 0; i < parts; i++) + for (i = 0; i < parts; i++) dst[i] |= rhs[i]; } @@ -2791,7 +2790,7 @@ APInt::tcXor(integerPart *dst, const integerPart *rhs, unsigned int parts) { unsigned int i; - for(i = 0; i < parts; i++) + for (i = 0; i < parts; i++) dst[i] ^= rhs[i]; } @@ -2801,7 +2800,7 @@ APInt::tcComplement(integerPart *dst, unsigned int parts) { unsigned int i; - for(i = 0; i < parts; i++) + for (i = 0; i < parts; i++) dst[i] = ~dst[i]; } @@ -2830,7 +2829,7 @@ APInt::tcIncrement(integerPart *dst, unsigned int parts) { unsigned int i; - for(i = 0; i < parts; i++) + for (i = 0; i < parts; i++) if (++dst[i] != 0) break; diff --git a/lib/Support/CommandLine.cpp b/lib/Support/CommandLine.cpp index 2ab4103de2ec..d31f34ed6618 100644 --- a/lib/Support/CommandLine.cpp +++ b/lib/Support/CommandLine.cpp @@ -676,8 +676,8 @@ void cl::ParseCommandLineOptions(int argc, char **argv, << " positional arguments: See: " << argv[0] << " -help\n"; ErrorParsing = true; - } else if (!HasUnlimitedPositionals - && PositionalVals.size() > PositionalOpts.size()) { + } else if (!HasUnlimitedPositionals && + PositionalVals.size() > PositionalOpts.size()) { errs() << ProgramName << ": Too many positional arguments specified!\n" << "Can specify at most " << PositionalOpts.size() diff --git a/lib/Support/Debug.cpp b/lib/Support/Debug.cpp index 82b4b8ce1636..eccfa0bd0725 100644 --- a/lib/Support/Debug.cpp +++ b/lib/Support/Debug.cpp @@ -64,8 +64,7 @@ DebugOnly("debug-only", cl::desc("Enable a specific type of debug output"), cl::location(DebugOnlyOptLoc), cl::ValueRequired); // 
Signal handlers - dump debug output on termination. -static void debug_user_sig_handler(void *Cookie) -{ +static void debug_user_sig_handler(void *Cookie) { // This is a bit sneaky. Since this is under #ifndef NDEBUG, we // know that debug mode is enabled and dbgs() really is a // circular_raw_ostream. If NDEBUG is defined, then dbgs() == diff --git a/lib/Support/ErrorHandling.cpp b/lib/Support/ErrorHandling.cpp index 8bb156653a74..4412cb2dd0d4 100644 --- a/lib/Support/ErrorHandling.cpp +++ b/lib/Support/ErrorHandling.cpp @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// // -// This file defines an API for error handling, it supersedes cerr+abort(), and +// This file defines an API for error handling, it supersedes cerr+abort(), and // cerr+exit() style error handling. // Callbacks can be registered for these errors through this API. //===----------------------------------------------------------------------===// @@ -57,7 +57,7 @@ void llvm_report_error(const Twine &reason) { exit(1); } -void llvm_unreachable_internal(const char *msg, const char *file, +void llvm_unreachable_internal(const char *msg, const char *file, unsigned line) { // This code intentionally doesn't call the ErrorHandler callback, because // llvm_unreachable is intended to be used to indicate "impossible" @@ -71,4 +71,3 @@ void llvm_unreachable_internal(const char *msg, const char *file, abort(); } } - diff --git a/lib/Support/MemoryBuffer.cpp b/lib/Support/MemoryBuffer.cpp index 345a78c190c0..4f135ead182c 100644 --- a/lib/Support/MemoryBuffer.cpp +++ b/lib/Support/MemoryBuffer.cpp @@ -14,6 +14,7 @@ #include "llvm/Support/MemoryBuffer.h" #include "llvm/ADT/OwningPtr.h" #include "llvm/ADT/SmallString.h" +#include "llvm/System/Errno.h" #include "llvm/System/Path.h" #include "llvm/System/Process.h" #include "llvm/System/Program.h" @@ -167,6 +168,14 @@ class MemoryBufferMMapFile : public MemoryBuffer { sys::Path::UnMapFilePages(getBufferStart(), 
getBufferSize()); } }; + +/// FileCloser - RAII object to make sure an FD gets closed properly. +class FileCloser { + int FD; +public: + FileCloser(int FD) : FD(FD) {} + ~FileCloser() { ::close(FD); } +}; } MemoryBuffer *MemoryBuffer::getFile(StringRef Filename, std::string *ErrStr, @@ -178,9 +187,10 @@ MemoryBuffer *MemoryBuffer::getFile(StringRef Filename, std::string *ErrStr, SmallString<256> PathBuf(Filename.begin(), Filename.end()); int FD = ::open(PathBuf.c_str(), O_RDONLY|OpenFlags); if (FD == -1) { - if (ErrStr) *ErrStr = strerror(errno); + if (ErrStr) *ErrStr = sys::StrError(); return 0; } + FileCloser FC(FD); // Close FD on return. // If we don't know the file size, use fstat to find out. fstat on an open // file descriptor is cheaper than stat on a random path. @@ -190,8 +200,7 @@ MemoryBuffer *MemoryBuffer::getFile(StringRef Filename, std::string *ErrStr, // TODO: This should use fstat64 when available. if (fstat(FD, FileInfoPtr) == -1) { - if (ErrStr) *ErrStr = strerror(errno); - ::close(FD); + if (ErrStr) *ErrStr = sys::StrError(); return 0; } FileSize = FileInfoPtr->st_size; @@ -208,7 +217,6 @@ MemoryBuffer *MemoryBuffer::getFile(StringRef Filename, std::string *ErrStr, (FileSize & (sys::Process::GetPageSize()-1)) != 0) { if (const char *Pages = sys::Path::MapInFilePages(FD, FileSize)) { // Close the file descriptor, now that the whole file is in memory. - ::close(FD); return new MemoryBufferMMapFile(Filename, Pages, FileSize); } } @@ -217,30 +225,31 @@ MemoryBuffer *MemoryBuffer::getFile(StringRef Filename, std::string *ErrStr, if (!Buf) { // Failed to create a buffer. 
if (ErrStr) *ErrStr = "could not allocate buffer"; - ::close(FD); return 0; } OwningPtr SB(Buf); char *BufPtr = const_cast(SB->getBufferStart()); - + size_t BytesLeft = FileSize; while (BytesLeft) { ssize_t NumRead = ::read(FD, BufPtr, BytesLeft); - if (NumRead > 0) { - BytesLeft -= NumRead; - BufPtr += NumRead; - } else if (NumRead == -1 && errno == EINTR) { - // try again - } else { - // error reading. - if (ErrStr) *ErrStr = strerror(errno); - close(FD); + if (NumRead == -1) { + if (errno == EINTR) + continue; + // Error while reading. + if (ErrStr) *ErrStr = sys::StrError(); return 0; + } else if (NumRead == 0) { + // We hit EOF early, truncate and terminate buffer. + Buf->BufferEnd = BufPtr; + *BufPtr = 0; + return SB.take(); } + BytesLeft -= NumRead; + BufPtr += NumRead; } - close(FD); - + return SB.take(); } diff --git a/lib/Support/Statistic.cpp b/lib/Support/Statistic.cpp index e78767045998..7d5f65af2842 100644 --- a/lib/Support/Statistic.cpp +++ b/lib/Support/Statistic.cpp @@ -32,8 +32,8 @@ #include using namespace llvm; -// GetLibSupportInfoOutputFile - Return a file stream to print our output on. -namespace llvm { extern raw_ostream *GetLibSupportInfoOutputFile(); } +// CreateInfoOutputFile - Return a file stream to print our output on. +namespace llvm { extern raw_ostream *CreateInfoOutputFile(); } /// -stats - Command line option to cause transformations to emit stats about /// what they did. @@ -48,6 +48,8 @@ namespace { /// llvm_shutdown is called. We print statistics from the destructor. class StatisticInfo { std::vector Stats; + friend void llvm::PrintStatistics(); + friend void llvm::PrintStatistics(raw_ostream &OS); public: ~StatisticInfo(); @@ -92,42 +94,55 @@ struct NameCompare { // Print information when destroyed, iff command line option is specified. StatisticInfo::~StatisticInfo() { - // Statistics not enabled? - if (Stats.empty()) return; + llvm::PrintStatistics(); +} - // Get the stream to write to. 
- raw_ostream &OutStream = *GetLibSupportInfoOutputFile(); +void llvm::EnableStatistics() { + Enabled.setValue(true); +} + +void llvm::PrintStatistics(raw_ostream &OS) { + StatisticInfo &Stats = *StatInfo; // Figure out how long the biggest Value and Name fields are. unsigned MaxNameLen = 0, MaxValLen = 0; - for (size_t i = 0, e = Stats.size(); i != e; ++i) { + for (size_t i = 0, e = Stats.Stats.size(); i != e; ++i) { MaxValLen = std::max(MaxValLen, - (unsigned)utostr(Stats[i]->getValue()).size()); + (unsigned)utostr(Stats.Stats[i]->getValue()).size()); MaxNameLen = std::max(MaxNameLen, - (unsigned)std::strlen(Stats[i]->getName())); + (unsigned)std::strlen(Stats.Stats[i]->getName())); } // Sort the fields by name. - std::stable_sort(Stats.begin(), Stats.end(), NameCompare()); + std::stable_sort(Stats.Stats.begin(), Stats.Stats.end(), NameCompare()); // Print out the statistics header... - OutStream << "===" << std::string(73, '-') << "===\n" - << " ... Statistics Collected ...\n" - << "===" << std::string(73, '-') << "===\n\n"; + OS << "===" << std::string(73, '-') << "===\n" + << " ... Statistics Collected ...\n" + << "===" << std::string(73, '-') << "===\n\n"; // Print all of the statistics. - for (size_t i = 0, e = Stats.size(); i != e; ++i) { - std::string CountStr = utostr(Stats[i]->getValue()); - OutStream << std::string(MaxValLen-CountStr.size(), ' ') - << CountStr << " " << Stats[i]->getName() - << std::string(MaxNameLen-std::strlen(Stats[i]->getName()), ' ') - << " - " << Stats[i]->getDesc() << "\n"; - + for (size_t i = 0, e = Stats.Stats.size(); i != e; ++i) { + std::string CountStr = utostr(Stats.Stats[i]->getValue()); + OS << std::string(MaxValLen-CountStr.size(), ' ') + << CountStr << " " << Stats.Stats[i]->getName() + << std::string(MaxNameLen-std::strlen(Stats.Stats[i]->getName()), ' ') + << " - " << Stats.Stats[i]->getDesc() << "\n"; } - OutStream << '\n'; // Flush the output stream... 
- OutStream.flush(); - - if (&OutStream != &outs() && &OutStream != &errs() && &OutStream != &dbgs()) - delete &OutStream; // Close the file. + OS << '\n'; // Flush the output stream. + OS.flush(); + +} + +void llvm::PrintStatistics() { + StatisticInfo &Stats = *StatInfo; + + // Statistics not enabled? + if (Stats.Stats.empty()) return; + + // Get the stream to write to. + raw_ostream &OutStream = *CreateInfoOutputFile(); + PrintStatistics(OutStream); + delete &OutStream; // Close the file. } diff --git a/lib/Support/Timer.cpp b/lib/Support/Timer.cpp index 4bdfac298cc0..4fac0737f0c0 100644 --- a/lib/Support/Timer.cpp +++ b/lib/Support/Timer.cpp @@ -11,20 +11,20 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Support/Debug.h" #include "llvm/Support/Timer.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/Format.h" +#include "llvm/System/Mutex.h" #include "llvm/System/Process.h" -#include -#include -#include +#include "llvm/ADT/OwningPtr.h" +#include "llvm/ADT/StringMap.h" using namespace llvm; -// GetLibSupportInfoOutputFile - Return a file stream to print our output on. -namespace llvm { extern raw_ostream *GetLibSupportInfoOutputFile(); } +// CreateInfoOutputFile - Return a file stream to print our output on. +namespace llvm { extern raw_ostream *CreateInfoOutputFile(); } // getLibSupportInfoOutputFilename - This ugly hack is brought to you courtesy // of constructor/destructor ordering being unspecified by C++. Basically the @@ -53,117 +53,103 @@ namespace { cl::Hidden, cl::location(getLibSupportInfoOutputFilename())); } +// CreateInfoOutputFile - Return a file stream to print our output on. +raw_ostream *llvm::CreateInfoOutputFile() { + const std::string &OutputFilename = getLibSupportInfoOutputFilename(); + if (OutputFilename.empty()) + return new raw_fd_ostream(2, false); // stderr. 
+ if (OutputFilename == "-") + return new raw_fd_ostream(1, false); // stdout. + + std::string Error; + raw_ostream *Result = new raw_fd_ostream(OutputFilename.c_str(), + Error, raw_fd_ostream::F_Append); + if (Error.empty()) + return Result; + + errs() << "Error opening info-output-file '" + << OutputFilename << " for appending!\n"; + delete Result; + return new raw_fd_ostream(2, false); // stderr. +} + + static TimerGroup *DefaultTimerGroup = 0; static TimerGroup *getDefaultTimerGroup() { - TimerGroup* tmp = DefaultTimerGroup; + TimerGroup *tmp = DefaultTimerGroup; sys::MemoryFence(); + if (tmp) return tmp; + + llvm_acquire_global_lock(); + tmp = DefaultTimerGroup; if (!tmp) { - llvm_acquire_global_lock(); - tmp = DefaultTimerGroup; - if (!tmp) { - tmp = new TimerGroup("Miscellaneous Ungrouped Timers"); - sys::MemoryFence(); - DefaultTimerGroup = tmp; - } - llvm_release_global_lock(); + tmp = new TimerGroup("Miscellaneous Ungrouped Timers"); + sys::MemoryFence(); + DefaultTimerGroup = tmp; } + llvm_release_global_lock(); return tmp; } -Timer::Timer(const std::string &N) - : Elapsed(0), UserTime(0), SystemTime(0), MemUsed(0), PeakMem(0), Name(N), - Started(false), TG(getDefaultTimerGroup()) { - TG->addTimer(); +//===----------------------------------------------------------------------===// +// Timer Implementation +//===----------------------------------------------------------------------===// + +void Timer::init(StringRef N) { + assert(TG == 0 && "Timer already initialized"); + Name.assign(N.begin(), N.end()); + Started = false; + TG = getDefaultTimerGroup(); + TG->addTimer(*this); } -Timer::Timer(const std::string &N, TimerGroup &tg) - : Elapsed(0), UserTime(0), SystemTime(0), MemUsed(0), PeakMem(0), Name(N), - Started(false), TG(&tg) { - TG->addTimer(); +void Timer::init(StringRef N, TimerGroup &tg) { + assert(TG == 0 && "Timer already initialized"); + Name.assign(N.begin(), N.end()); + Started = false; + TG = &tg; + TG->addTimer(*this); } -Timer::Timer(const 
Timer &T) { - TG = T.TG; - if (TG) TG->addTimer(); - operator=(T); -} - - -// Copy ctor, initialize with no TG member. -Timer::Timer(bool, const Timer &T) { - TG = T.TG; // Avoid assertion in operator= - operator=(T); // Copy contents - TG = 0; -} - - Timer::~Timer() { - if (TG) { - if (Started) { - Started = false; - TG->addTimerToPrint(*this); - } - TG->removeTimer(); - } + if (!TG) return; // Never initialized, or already cleared. + TG->removeTimer(*this); } static inline size_t getMemUsage() { - if (TrackSpace) - return sys::Process::GetMallocUsage(); - return 0; + if (!TrackSpace) return 0; + return sys::Process::GetMallocUsage(); } -struct TimeRecord { - double Elapsed, UserTime, SystemTime; - ssize_t MemUsed; -}; - -static TimeRecord getTimeRecord(bool Start) { +TimeRecord TimeRecord::getCurrentTime(bool Start) { TimeRecord Result; - - sys::TimeValue now(0,0); - sys::TimeValue user(0,0); - sys::TimeValue sys(0,0); - - ssize_t MemUsed = 0; + sys::TimeValue now(0,0), user(0,0), sys(0,0); + if (Start) { - MemUsed = getMemUsage(); - sys::Process::GetTimeUsage(now,user,sys); + Result.MemUsed = getMemUsage(); + sys::Process::GetTimeUsage(now, user, sys); } else { - sys::Process::GetTimeUsage(now,user,sys); - MemUsed = getMemUsage(); + sys::Process::GetTimeUsage(now, user, sys); + Result.MemUsed = getMemUsage(); } - Result.Elapsed = now.seconds() + now.microseconds() / 1000000.0; - Result.UserTime = user.seconds() + user.microseconds() / 1000000.0; - Result.SystemTime = sys.seconds() + sys.microseconds() / 1000000.0; - Result.MemUsed = MemUsed; - + Result.WallTime = now.seconds() + now.microseconds() / 1000000.0; + Result.UserTime = user.seconds() + user.microseconds() / 1000000.0; + Result.SystemTime = sys.seconds() + sys.microseconds() / 1000000.0; return Result; } static ManagedStatic > ActiveTimers; void Timer::startTimer() { - sys::SmartScopedLock L(*TimerLock); Started = true; ActiveTimers->push_back(this); - TimeRecord TR = getTimeRecord(true); - Elapsed -= 
TR.Elapsed; - UserTime -= TR.UserTime; - SystemTime -= TR.SystemTime; - MemUsed -= TR.MemUsed; - PeakMemBase = TR.MemUsed; + Time -= TimeRecord::getCurrentTime(true); } void Timer::stopTimer() { - sys::SmartScopedLock L(*TimerLock); - TimeRecord TR = getTimeRecord(false); - Elapsed += TR.Elapsed; - UserTime += TR.UserTime; - SystemTime += TR.SystemTime; - MemUsed += TR.MemUsed; + Time += TimeRecord::getCurrentTime(false); if (ActiveTimers->back() == this) { ActiveTimers->pop_back(); @@ -175,86 +161,8 @@ void Timer::stopTimer() { } } -void Timer::sum(const Timer &T) { - Elapsed += T.Elapsed; - UserTime += T.UserTime; - SystemTime += T.SystemTime; - MemUsed += T.MemUsed; - PeakMem += T.PeakMem; -} - -/// addPeakMemoryMeasurement - This method should be called whenever memory -/// usage needs to be checked. It adds a peak memory measurement to the -/// currently active timers, which will be printed when the timer group prints -/// -void Timer::addPeakMemoryMeasurement() { - sys::SmartScopedLock L(*TimerLock); - size_t MemUsed = getMemUsage(); - - for (std::vector::iterator I = ActiveTimers->begin(), - E = ActiveTimers->end(); I != E; ++I) - (*I)->PeakMem = std::max((*I)->PeakMem, MemUsed-(*I)->PeakMemBase); -} - -//===----------------------------------------------------------------------===// -// NamedRegionTimer Implementation -//===----------------------------------------------------------------------===// - -namespace { - -typedef std::map Name2Timer; -typedef std::map > Name2Pair; - -} - -static ManagedStatic NamedTimers; - -static ManagedStatic NamedGroupedTimers; - -static Timer &getNamedRegionTimer(const std::string &Name) { - sys::SmartScopedLock L(*TimerLock); - Name2Timer::iterator I = NamedTimers->find(Name); - if (I != NamedTimers->end()) - return I->second; - - return NamedTimers->insert(I, std::make_pair(Name, Timer(Name)))->second; -} - -static Timer &getNamedRegionTimer(const std::string &Name, - const std::string &GroupName) { - sys::SmartScopedLock 
L(*TimerLock); - - Name2Pair::iterator I = NamedGroupedTimers->find(GroupName); - if (I == NamedGroupedTimers->end()) { - TimerGroup TG(GroupName); - std::pair Pair(TG, Name2Timer()); - I = NamedGroupedTimers->insert(I, std::make_pair(GroupName, Pair)); - } - - Name2Timer::iterator J = I->second.second.find(Name); - if (J == I->second.second.end()) - J = I->second.second.insert(J, - std::make_pair(Name, - Timer(Name, - I->second.first))); - - return J->second; -} - -NamedRegionTimer::NamedRegionTimer(const std::string &Name) - : TimeRegion(getNamedRegionTimer(Name)) {} - -NamedRegionTimer::NamedRegionTimer(const std::string &Name, - const std::string &GroupName) - : TimeRegion(getNamedRegionTimer(Name, GroupName)) {} - -//===----------------------------------------------------------------------===// -// TimerGroup Implementation -//===----------------------------------------------------------------------===// - - static void printVal(double Val, double Total, raw_ostream &OS) { - if (Total < 1e-7) // Avoid dividing by zero... + if (Total < 1e-7) // Avoid dividing by zero. 
OS << " ----- "; else { OS << " " << format("%7.4f", Val) << " ("; @@ -262,130 +170,214 @@ static void printVal(double Val, double Total, raw_ostream &OS) { } } -void Timer::print(const Timer &Total, raw_ostream &OS) { - sys::SmartScopedLock L(*TimerLock); - if (Total.UserTime) - printVal(UserTime, Total.UserTime, OS); - if (Total.SystemTime) - printVal(SystemTime, Total.SystemTime, OS); +void TimeRecord::print(const TimeRecord &Total, raw_ostream &OS) const { + if (Total.getUserTime()) + printVal(getUserTime(), Total.getUserTime(), OS); + if (Total.getSystemTime()) + printVal(getSystemTime(), Total.getSystemTime(), OS); if (Total.getProcessTime()) printVal(getProcessTime(), Total.getProcessTime(), OS); - printVal(Elapsed, Total.Elapsed, OS); - + printVal(getWallTime(), Total.getWallTime(), OS); + OS << " "; + + if (Total.getMemUsed()) + OS << format("%9lld", (long long)getMemUsed()) << " "; +} - if (Total.MemUsed) { - OS << format("%9lld", (long long)MemUsed) << " "; + +//===----------------------------------------------------------------------===// +// NamedRegionTimer Implementation +//===----------------------------------------------------------------------===// + +typedef StringMap Name2TimerMap; + +class Name2PairMap { + StringMap > Map; +public: + ~Name2PairMap() { + for (StringMap >::iterator + I = Map.begin(), E = Map.end(); I != E; ++I) + delete I->second.first; } - if (Total.PeakMem) { - if (PeakMem) { - OS << format("%9lld", (long long)PeakMem) << " "; - } else - OS << " "; + + Timer &get(StringRef Name, StringRef GroupName) { + sys::SmartScopedLock L(*TimerLock); + + std::pair &GroupEntry = Map[GroupName]; + + if (!GroupEntry.first) + GroupEntry.first = new TimerGroup(GroupName); + + Timer &T = GroupEntry.second[Name]; + if (!T.isInitialized()) + T.init(Name, *GroupEntry.first); + return T; } - OS << Name << "\n"; +}; - Started = false; // Once printed, don't print again -} +static ManagedStatic NamedTimers; +static ManagedStatic NamedGroupedTimers; 
-// GetLibSupportInfoOutputFile - Return a file stream to print our output on... -raw_ostream * -llvm::GetLibSupportInfoOutputFile() { - std::string &LibSupportInfoOutputFilename = getLibSupportInfoOutputFilename(); - if (LibSupportInfoOutputFilename.empty()) - return &errs(); - if (LibSupportInfoOutputFilename == "-") - return &outs(); - - - std::string Error; - raw_ostream *Result = new raw_fd_ostream(LibSupportInfoOutputFilename.c_str(), - Error, raw_fd_ostream::F_Append); - if (Error.empty()) - return Result; - - errs() << "Error opening info-output-file '" - << LibSupportInfoOutputFilename << " for appending!\n"; - delete Result; - return &errs(); -} - - -void TimerGroup::removeTimer() { +static Timer &getNamedRegionTimer(StringRef Name) { sys::SmartScopedLock L(*TimerLock); - if (--NumTimers == 0 && !TimersToPrint.empty()) { // Print timing report... - // Sort the timers in descending order by amount of time taken... - std::sort(TimersToPrint.begin(), TimersToPrint.end(), - std::greater()); + + Timer &T = (*NamedTimers)[Name]; + if (!T.isInitialized()) + T.init(Name); + return T; +} - // Figure out how many spaces to indent TimerGroup name... - unsigned Padding = (80-Name.length())/2; - if (Padding > 80) Padding = 0; // Don't allow "negative" numbers +NamedRegionTimer::NamedRegionTimer(StringRef Name) + : TimeRegion(getNamedRegionTimer(Name)) {} - raw_ostream *OutStream = GetLibSupportInfoOutputFile(); +NamedRegionTimer::NamedRegionTimer(StringRef Name, StringRef GroupName) + : TimeRegion(NamedGroupedTimers->get(Name, GroupName)) {} - ++NumTimers; - { // Scope to contain Total timer... don't allow total timer to drop us to - // zero timers... 
- Timer Total("TOTAL"); +//===----------------------------------------------------------------------===// +// TimerGroup Implementation +//===----------------------------------------------------------------------===// - for (unsigned i = 0, e = TimersToPrint.size(); i != e; ++i) - Total.sum(TimersToPrint[i]); +/// TimerGroupList - This is the global list of TimerGroups, maintained by the +/// TimerGroup ctor/dtor and is protected by the TimerLock lock. +static TimerGroup *TimerGroupList = 0; - // Print out timing header... - *OutStream << "===" << std::string(73, '-') << "===\n" - << std::string(Padding, ' ') << Name << "\n" - << "===" << std::string(73, '-') - << "===\n"; +TimerGroup::TimerGroup(StringRef name) + : Name(name.begin(), name.end()), FirstTimer(0) { + + // Add the group to TimerGroupList. + sys::SmartScopedLock L(*TimerLock); + if (TimerGroupList) + TimerGroupList->Prev = &Next; + Next = TimerGroupList; + Prev = &TimerGroupList; + TimerGroupList = this; +} - // If this is not an collection of ungrouped times, print the total time. - // Ungrouped timers don't really make sense to add up. We still print the - // TOTAL line to make the percentages make sense. - if (this != DefaultTimerGroup) { - *OutStream << " Total Execution Time: "; +TimerGroup::~TimerGroup() { + // If the timer group is destroyed before the timers it owns, accumulate and + // print the timing data. + while (FirstTimer != 0) + removeTimer(*FirstTimer); + + // Remove the group from the TimerGroupList. 
+ sys::SmartScopedLock L(*TimerLock); + *Prev = Next; + if (Next) + Next->Prev = Prev; +} - *OutStream << format("%5.4f", Total.getProcessTime()) << " seconds ("; - *OutStream << format("%5.4f", Total.getWallTime()) << " wall clock)\n"; - } - *OutStream << "\n"; - if (Total.UserTime) - *OutStream << " ---User Time---"; - if (Total.SystemTime) - *OutStream << " --System Time--"; - if (Total.getProcessTime()) - *OutStream << " --User+System--"; - *OutStream << " ---Wall Time---"; - if (Total.getMemUsed()) - *OutStream << " ---Mem---"; - if (Total.getPeakMem()) - *OutStream << " -PeakMem-"; - *OutStream << " --- Name ---\n"; +void TimerGroup::removeTimer(Timer &T) { + sys::SmartScopedLock L(*TimerLock); + + // If the timer was started, move its data to TimersToPrint. + if (T.Started) + TimersToPrint.push_back(std::make_pair(T.Time, T.Name)); - // Loop through all of the timing data, printing it out... - for (unsigned i = 0, e = TimersToPrint.size(); i != e; ++i) - TimersToPrint[i].print(Total, *OutStream); + T.TG = 0; + + // Unlink the timer from our list. + *T.Prev = T.Next; + if (T.Next) + T.Next->Prev = T.Prev; + + // Print the report when all timers in this group are destroyed if some of + // them were started. + if (FirstTimer != 0 || TimersToPrint.empty()) + return; + + raw_ostream *OutStream = CreateInfoOutputFile(); + PrintQueuedTimers(*OutStream); + delete OutStream; // Close the file. +} - Total.print(Total, *OutStream); - *OutStream << '\n'; - OutStream->flush(); - } - --NumTimers; +void TimerGroup::addTimer(Timer &T) { + sys::SmartScopedLock L(*TimerLock); + + // Add the timer to our list. + if (FirstTimer) + FirstTimer->Prev = &T.Next; + T.Next = FirstTimer; + T.Prev = &FirstTimer; + FirstTimer = &T; +} - TimersToPrint.clear(); - - if (OutStream != &errs() && OutStream != &outs() && OutStream != &dbgs()) - delete OutStream; // Close the file... 
+void TimerGroup::PrintQueuedTimers(raw_ostream &OS) { + // Sort the timers in descending order by amount of time taken. + std::sort(TimersToPrint.begin(), TimersToPrint.end()); + + TimeRecord Total; + for (unsigned i = 0, e = TimersToPrint.size(); i != e; ++i) + Total += TimersToPrint[i].first; + + // Print out timing header. + OS << "===" << std::string(73, '-') << "===\n"; + // Figure out how many spaces to indent TimerGroup name. + unsigned Padding = (80-Name.length())/2; + if (Padding > 80) Padding = 0; // Don't allow "negative" numbers + OS.indent(Padding) << Name << '\n'; + OS << "===" << std::string(73, '-') << "===\n"; + + // If this is not an collection of ungrouped times, print the total time. + // Ungrouped timers don't really make sense to add up. We still print the + // TOTAL line to make the percentages make sense. + if (this != DefaultTimerGroup) { + OS << " Total Execution Time: "; + OS << format("%5.4f", Total.getProcessTime()) << " seconds ("; + OS << format("%5.4f", Total.getWallTime()) << " wall clock)\n"; } + OS << '\n'; + + if (Total.getUserTime()) + OS << " ---User Time---"; + if (Total.getSystemTime()) + OS << " --System Time--"; + if (Total.getProcessTime()) + OS << " --User+System--"; + OS << " ---Wall Time---"; + if (Total.getMemUsed()) + OS << " ---Mem---"; + OS << " --- Name ---\n"; + + // Loop through all of the timing data, printing it out. + for (unsigned i = 0, e = TimersToPrint.size(); i != e; ++i) { + const std::pair &Entry = TimersToPrint[e-i-1]; + Entry.first.print(Total, OS); + OS << Entry.second << '\n'; + } + + Total.print(Total, OS); + OS << "Total\n\n"; + OS.flush(); + + TimersToPrint.clear(); } -void TimerGroup::addTimer() { +/// print - Print any started timers in this group and zero them. +void TimerGroup::print(raw_ostream &OS) { sys::SmartScopedLock L(*TimerLock); - ++NumTimers; + + // See if any of our timers were started, if so add them to TimersToPrint and + // reset them. 
+ for (Timer *T = FirstTimer; T; T = T->Next) { + if (!T->Started) continue; + TimersToPrint.push_back(std::make_pair(T->Time, T->Name)); + + // Clear out the time. + T->Started = 0; + T->Time = TimeRecord(); + } + + // If any timers were started, print the group. + if (!TimersToPrint.empty()) + PrintQueuedTimers(OS); } -void TimerGroup::addTimerToPrint(const Timer &T) { +/// printAll - This static method prints all timers and clears them all out. +void TimerGroup::printAll(raw_ostream &OS) { sys::SmartScopedLock L(*TimerLock); - TimersToPrint.push_back(Timer(true, T)); -} + for (TimerGroup *TG = TimerGroupList; TG; TG = TG->Next) + TG->print(OS); +} diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp index 61bf0a73c9cb..9796ca56f476 100644 --- a/lib/Support/Triple.cpp +++ b/lib/Support/Triple.cpp @@ -189,7 +189,7 @@ Triple::ArchType Triple::getArchTypeForDarwinArchName(StringRef Str) { return Triple::UnknownArch; } -// Returns architecture name that is unsderstood by the target assembler. +// Returns architecture name that is understood by the target assembler. 
const char *Triple::getArchNameForAssembler() { if (getOS() != Triple::Darwin && getVendor() != Triple::Apple) return NULL; diff --git a/lib/Support/raw_ostream.cpp b/lib/Support/raw_ostream.cpp index 071c924d9195..f59bd0d90301 100644 --- a/lib/Support/raw_ostream.cpp +++ b/lib/Support/raw_ostream.cpp @@ -81,9 +81,9 @@ void raw_ostream::SetBuffered() { SetUnbuffered(); } -void raw_ostream::SetBufferAndMode(char *BufferStart, size_t Size, +void raw_ostream::SetBufferAndMode(char *BufferStart, size_t Size, BufferKind Mode) { - assert(((Mode == Unbuffered && BufferStart == 0 && Size == 0) || + assert(((Mode == Unbuffered && BufferStart == 0 && Size == 0) || (Mode != Unbuffered && BufferStart && Size)) && "stream must be unbuffered or have at least one byte"); // Make sure the current buffer is free of content (we can't flush here; the @@ -104,11 +104,11 @@ raw_ostream &raw_ostream::operator<<(unsigned long N) { // Zero is a special case. if (N == 0) return *this << '0'; - + char NumberBuffer[20]; char *EndPtr = NumberBuffer+sizeof(NumberBuffer); char *CurPtr = EndPtr; - + while (N) { *--CurPtr = '0' + char(N % 10); N /= 10; @@ -121,7 +121,7 @@ raw_ostream &raw_ostream::operator<<(long N) { *this << '-'; N = -N; } - + return this->operator<<(static_cast(N)); } @@ -133,7 +133,7 @@ raw_ostream &raw_ostream::operator<<(unsigned long long N) { char NumberBuffer[20]; char *EndPtr = NumberBuffer+sizeof(NumberBuffer); char *CurPtr = EndPtr; - + while (N) { *--CurPtr = '0' + char(N % 10); N /= 10; @@ -146,7 +146,7 @@ raw_ostream &raw_ostream::operator<<(long long N) { *this << '-'; N = -N; } - + return this->operator<<(static_cast(N)); } @@ -297,33 +297,33 @@ raw_ostream &raw_ostream::operator<<(const format_object_base &Fmt) { size_t BufferBytesLeft = OutBufEnd - OutBufCur; if (BufferBytesLeft > 3) { size_t BytesUsed = Fmt.print(OutBufCur, BufferBytesLeft); - + // Common case is that we have plenty of space. 
if (BytesUsed <= BufferBytesLeft) { OutBufCur += BytesUsed; return *this; } - + // Otherwise, we overflowed and the return value tells us the size to try // again with. NextBufferSize = BytesUsed; } - + // If we got here, we didn't have enough space in the output buffer for the // string. Try printing into a SmallVector that is resized to have enough // space. Iterate until we win. SmallVector V; - + while (1) { V.resize(NextBufferSize); - + // Try formatting into the SmallVector. size_t BytesUsed = Fmt.print(V.data(), NextBufferSize); - + // If BytesUsed fit into the vector, we win. if (BytesUsed <= NextBufferSize) return write(V.data(), BytesUsed); - + // Otherwise, try again with a new size. assert(BytesUsed > NextBufferSize && "Didn't grow buffer!?"); NextBufferSize = BytesUsed; @@ -339,7 +339,7 @@ raw_ostream &raw_ostream::indent(unsigned NumSpaces) { // Usually the indentation is small, handle it with a fastpath. if (NumSpaces < array_lengthof(Spaces)) return write(Spaces, NumSpaces); - + while (NumSpaces) { unsigned NumToWrite = std::min(NumSpaces, (unsigned)array_lengthof(Spaces)-1); @@ -372,7 +372,7 @@ raw_fd_ostream::raw_fd_ostream(const char *Filename, std::string &ErrorInfo, // Verify that we don't have both "append" and "excl". assert((!(Flags & F_Excl) || !(Flags & F_Append)) && "Cannot specify both 'excl' and 'append' file creation flags!"); - + ErrorInfo.clear(); // Handle "-" as stdout. 
@@ -385,20 +385,20 @@ raw_fd_ostream::raw_fd_ostream(const char *Filename, std::string &ErrorInfo, ShouldClose = false; return; } - + int OpenFlags = O_WRONLY|O_CREAT; #ifdef O_BINARY if (Flags & F_Binary) OpenFlags |= O_BINARY; #endif - + if (Flags & F_Append) OpenFlags |= O_APPEND; else OpenFlags |= O_TRUNC; if (Flags & F_Excl) OpenFlags |= O_EXCL; - + FD = open(Filename, OpenFlags, 0664); if (FD < 0) { ErrorInfo = "Error opening output file '" + std::string(Filename) + "'"; @@ -418,14 +418,14 @@ raw_fd_ostream::~raw_fd_ostream() { void raw_fd_ostream::write_impl(const char *Ptr, size_t Size) { - assert (FD >= 0 && "File already closed."); + assert(FD >= 0 && "File already closed."); pos += Size; if (::write(FD, Ptr, Size) != (ssize_t) Size) error_detected(); } void raw_fd_ostream::close() { - assert (ShouldClose); + assert(ShouldClose); ShouldClose = false; flush(); if (::close(FD) != 0) @@ -438,7 +438,7 @@ uint64_t raw_fd_ostream::seek(uint64_t off) { pos = ::lseek(FD, off, SEEK_SET); if (pos != off) error_detected(); - return pos; + return pos; } size_t raw_fd_ostream::preferred_buffer_size() const { @@ -447,7 +447,7 @@ size_t raw_fd_ostream::preferred_buffer_size() const { struct stat statbuf; if (fstat(FD, &statbuf) != 0) return 0; - + // If this is a terminal, don't use buffering. Line buffering // would be a more traditional thing to do, but it's not worth // the complexity. 
diff --git a/lib/System/Unix/Mutex.inc b/lib/System/Unix/Mutex.inc index 10e7ecb75a5f..4a5e28de27b0 100644 --- a/lib/System/Unix/Mutex.inc +++ b/lib/System/Unix/Mutex.inc @@ -28,12 +28,6 @@ MutexImpl::~MutexImpl() { } -bool -MutexImpl::MutexImpl() -{ - return true; -} - bool MutexImpl::release() { diff --git a/lib/System/Unix/Path.inc b/lib/System/Unix/Path.inc index a99720c95ded..52253b30a570 100644 --- a/lib/System/Unix/Path.inc +++ b/lib/System/Unix/Path.inc @@ -454,7 +454,7 @@ Path::canWrite() const { bool Path::isRegularFile() const { - // Get the status so we can determine if its a file or directory + // Get the status so we can determine if it's a file or directory struct stat buf; if (0 != stat(path.c_str(), &buf)) @@ -736,7 +736,7 @@ Path::createTemporaryFileOnDisk(bool reuse_current, std::string* ErrMsg) { bool Path::eraseFromDisk(bool remove_contents, std::string *ErrStr) const { - // Get the status so we can determine if its a file or directory + // Get the status so we can determine if it's a file or directory. struct stat buf; if (0 != stat(path.c_str(), &buf)) { MakeErrMsg(ErrStr, path + ": can't get status of file"); diff --git a/lib/System/Win32/Program.inc b/lib/System/Win32/Program.inc index a3b40d0e3650..16bb28e17a21 100644 --- a/lib/System/Win32/Program.inc +++ b/lib/System/Win32/Program.inc @@ -138,6 +138,24 @@ static bool ArgNeedsQuotes(const char *Str) { return Str[0] == '\0' || strchr(Str, ' ') != 0; } + +/// ArgLenWithQuotes - Check whether argument needs to be quoted when calling +/// CreateProcess and returns length of quoted arg with escaped quotes +static unsigned int ArgLenWithQuotes(const char *Str) { + unsigned int len = ArgNeedsQuotes(Str) ? 2 : 0; + + while (*Str != '\0') { + if (*Str == '\"') + ++len; + + ++len; + ++Str; + } + + return len; +} + + bool Program::Execute(const Path& path, const char** args, @@ -165,9 +183,7 @@ Program::Execute(const Path& path, // First, determine the length of the command line. 
unsigned len = 0; for (unsigned i = 0; args[i]; i++) { - len += strlen(args[i]) + 1; - if (ArgNeedsQuotes(args[i])) - len += 2; + len += ArgLenWithQuotes(args[i]) + 1; } // Now build the command line. @@ -176,12 +192,18 @@ Program::Execute(const Path& path, for (unsigned i = 0; args[i]; i++) { const char *arg = args[i]; - size_t len = strlen(arg); + bool needsQuoting = ArgNeedsQuotes(arg); if (needsQuoting) *p++ = '"'; - memcpy(p, arg, len); - p += len; + + while (*arg != '\0') { + if (*arg == '\"') + *p++ = '\\'; + + *p++ = *arg++; + } + if (needsQuoting) *p++ = '"'; *p++ = ' '; diff --git a/lib/System/Win32/Signals.inc b/lib/System/Win32/Signals.inc index dba22185ac7f..f2b72ca70fdd 100644 --- a/lib/System/Win32/Signals.inc +++ b/lib/System/Win32/Signals.inc @@ -163,6 +163,7 @@ void sys::AddSignalHandler(void (*FnPtr)(void *), void *Cookie) { CallBacksToRun = new std::vector >(); CallBacksToRun->push_back(std::make_pair(FnPtr, Cookie)); RegisterHandler(); + LeaveCriticalSection(&CriticalSection); } } diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index bbb1dbdc9a4a..6486a608e46d 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -43,6 +43,21 @@ def FeatureThumb2 : SubtargetFeature<"thumb2", "ThumbMode", "Thumb2", def FeatureFP16 : SubtargetFeature<"fp16", "HasFP16", "true", "Enable half-precision floating point">; +// Some processors have multiply-accumulate instructions that don't +// play nicely with other VFP instructions, and it's generally better +// to just not use them. +// FIXME: Currently, this is only flagged for Cortex-A8. It may be true for +// others as well. We should do more benchmarking and confirm one way or +// the other. +def FeatureHasSlowVMLx : SubtargetFeature<"vmlx", "SlowVMLx", "true", + "Disable VFP MAC instructions">; +// Some processors benefit from using NEON instructions for scalar +// single-precision FP operations. 
+def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP", + "true", + "Use NEON for single precision FP">; + + //===----------------------------------------------------------------------===// // ARM Processors supported. // @@ -92,7 +107,8 @@ def : ProcNoItin<"iwmmxt", [ArchV5TE]>; // V6 Processors. def : Processor<"arm1136j-s", ARMV6Itineraries, [ArchV6]>; -def : Processor<"arm1136jf-s", ARMV6Itineraries, [ArchV6, FeatureVFP2]>; +def : Processor<"arm1136jf-s", ARMV6Itineraries, [ArchV6, FeatureVFP2, + FeatureHasSlowVMLx]>; def : Processor<"arm1176jz-s", ARMV6Itineraries, [ArchV6]>; def : Processor<"arm1176jzf-s", ARMV6Itineraries, [ArchV6, FeatureVFP2]>; def : Processor<"mpcorenovfp", ARMV6Itineraries, [ArchV6]>; @@ -106,7 +122,8 @@ def : Processor<"arm1156t2f-s", ARMV6Itineraries, // V7 Processors. def : Processor<"cortex-a8", CortexA8Itineraries, - [ArchV7A, FeatureThumb2, FeatureNEON]>; + [ArchV7A, FeatureThumb2, FeatureNEON, FeatureHasSlowVMLx, + FeatureNEONForFP]>; def : ProcNoItin<"cortex-a9", [ArchV7A, FeatureThumb2, FeatureNEON]>; //===----------------------------------------------------------------------===// diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index e6ea03aa4aab..0a0b0ea18540 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -204,7 +204,15 @@ ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB, bool AllowModify) const { // If the block has no terminators, it just falls into the block after it. MachineBasicBlock::iterator I = MBB.end(); - if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) + if (I == MBB.begin()) + return false; + --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return false; + --I; + } + if (!isUnpredicatedTerminator(I)) return false; // Get the last instruction in the block. 
@@ -275,6 +283,11 @@ unsigned ARMBaseInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { MachineBasicBlock::iterator I = MBB.end(); if (I == MBB.begin()) return 0; --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return 0; + --I; + } if (!isUncondBranchOpcode(I->getOpcode()) && !isCondBranchOpcode(I->getOpcode())) return 0; @@ -738,14 +751,16 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, RC == ARM::QPR_VFP2RegisterClass) && "Unknown regclass!"); // FIXME: Neon instructions should support predicates if (Align >= 16 && (getRegisterInfo().canRealignStack(MF))) { - AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1q64)) + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1q)) .addFrameIndex(FI).addImm(128) .addMemOperand(MMO) .addReg(SrcReg, getKillRegState(isKill))); } else { - AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTRQ)). + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMQ)). addReg(SrcReg, getKillRegState(isKill)) - .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); + .addFrameIndex(FI) + .addImm(ARM_AM::getAM5Opc(ARM_AM::ia, 4)) + .addMemOperand(MMO)); } } } @@ -788,12 +803,14 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, RC == ARM::QPR_8RegisterClass) && "Unknown regclass!"); if (Align >= 16 && (getRegisterInfo().canRealignStack(MF))) { - AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1q64), DestReg) + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1q), DestReg) .addFrameIndex(FI).addImm(128) .addMemOperand(MMO)); } else { - AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDRQ), DestReg) - .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMQ), DestReg) + .addFrameIndex(FI) + .addImm(ARM_AM::getAM5Opc(ARM_AM::ia, 4)) + .addMemOperand(MMO)); } } } diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index 71207c811e45..7d486631897a 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ 
b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -124,17 +124,17 @@ class ARMDAGToDAGISel : public SelectionDAGISel { /// SelectDYN_ALLOC - Select dynamic alloc for Thumb. SDNode *SelectDYN_ALLOC(SDNode *N); - /// SelectVLD - Select NEON load intrinsics. NumVecs should - /// be 2, 3 or 4. The opcode arrays specify the instructions used for + /// SelectVLD - Select NEON load intrinsics. NumVecs should be + /// 1, 2, 3 or 4. The opcode arrays specify the instructions used for /// loads of D registers and even subregs and odd subregs of Q registers. - /// For NumVecs == 2, QOpcodes1 is not used. + /// For NumVecs <= 2, QOpcodes1 is not used. SDNode *SelectVLD(SDNode *N, unsigned NumVecs, unsigned *DOpcodes, unsigned *QOpcodes0, unsigned *QOpcodes1); /// SelectVST - Select NEON store intrinsics. NumVecs should - /// be 2, 3 or 4. The opcode arrays specify the instructions used for + /// be 1, 2, 3 or 4. The opcode arrays specify the instructions used for /// stores of D registers and even subregs and odd subregs of Q registers. - /// For NumVecs == 2, QOpcodes1 is not used. + /// For NumVecs <= 2, QOpcodes1 is not used. 
SDNode *SelectVST(SDNode *N, unsigned NumVecs, unsigned *DOpcodes, unsigned *QOpcodes0, unsigned *QOpcodes1); @@ -1022,7 +1022,7 @@ static EVT GetNEONSubregVT(EVT VT) { SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs, unsigned *DOpcodes, unsigned *QOpcodes0, unsigned *QOpcodes1) { - assert(NumVecs >=2 && NumVecs <= 4 && "VLD NumVecs out-of-range"); + assert(NumVecs >= 1 && NumVecs <= 4 && "VLD NumVecs out-of-range"); DebugLoc dl = N->getDebugLoc(); SDValue MemAddr, Align; @@ -1047,6 +1047,9 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs, case MVT::v8i16: OpcodeIndex = 1; break; case MVT::v4f32: case MVT::v4i32: OpcodeIndex = 2; break; + case MVT::v2i64: OpcodeIndex = 3; + assert(NumVecs == 1 && "v2i64 type only supported for VLD1"); + break; } SDValue Pred = CurDAG->getTargetConstant(14, MVT::i32); @@ -1060,15 +1063,15 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs, } EVT RegVT = GetNEONSubregVT(VT); - if (NumVecs == 2) { - // Quad registers are directly supported for VLD2, - // loading 2 pairs of D regs. + if (NumVecs <= 2) { + // Quad registers are directly supported for VLD1 and VLD2, + // loading pairs of D regs. unsigned Opc = QOpcodes0[OpcodeIndex]; const SDValue Ops[] = { MemAddr, Align, Pred, Reg0, Chain }; - std::vector ResTys(4, VT); + std::vector ResTys(2 * NumVecs, RegVT); ResTys.push_back(MVT::Other); SDNode *VLd = CurDAG->getMachineNode(Opc, dl, ResTys, Ops, 5); - Chain = SDValue(VLd, 4); + Chain = SDValue(VLd, 2 * NumVecs); // Combine the even and odd subregs to produce the result. 
for (unsigned Vec = 0; Vec < NumVecs; ++Vec) { @@ -1109,7 +1112,7 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs, SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, unsigned *DOpcodes, unsigned *QOpcodes0, unsigned *QOpcodes1) { - assert(NumVecs >=2 && NumVecs <= 4 && "VST NumVecs out-of-range"); + assert(NumVecs >=1 && NumVecs <= 4 && "VST NumVecs out-of-range"); DebugLoc dl = N->getDebugLoc(); SDValue MemAddr, Align; @@ -1134,6 +1137,9 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, case MVT::v8i16: OpcodeIndex = 1; break; case MVT::v4f32: case MVT::v4i32: OpcodeIndex = 2; break; + case MVT::v2i64: OpcodeIndex = 3; + assert(NumVecs == 1 && "v2i64 type only supported for VST1"); + break; } SDValue Pred = CurDAG->getTargetConstant(14, MVT::i32); @@ -1154,9 +1160,9 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, } EVT RegVT = GetNEONSubregVT(VT); - if (NumVecs == 2) { - // Quad registers are directly supported for VST2, - // storing 2 pairs of D regs. + if (NumVecs <= 2) { + // Quad registers are directly supported for VST1 and VST2, + // storing pairs of D regs. unsigned Opc = QOpcodes0[OpcodeIndex]; for (unsigned Vec = 0; Vec < NumVecs; ++Vec) { Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::DSUBREG_0, dl, RegVT, @@ -1167,7 +1173,8 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, Ops.push_back(Pred); Ops.push_back(Reg0); // predicate register Ops.push_back(Chain); - return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops.data(), 9); + return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops.data(), + 5 + 2 * NumVecs); } // Otherwise, quad registers are stored with two separate instructions, @@ -1694,6 +1701,35 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { ResNode = SelectARMIndexedLoad(N); if (ResNode) return ResNode; + + // VLDMQ must be custom-selected for "v2f64 load" to set the AM5Opc value. 
+ if (Subtarget->hasVFP2() && + N->getValueType(0).getSimpleVT().SimpleTy == MVT::v2f64) { + SDValue Chain = N->getOperand(0); + SDValue AM5Opc = + CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::ia, 4), MVT::i32); + SDValue Pred = CurDAG->getTargetConstant(14, MVT::i32); + SDValue PredReg = CurDAG->getRegister(0, MVT::i32); + SDValue Ops[] = { N->getOperand(1), AM5Opc, Pred, PredReg, Chain }; + return CurDAG->getMachineNode(ARM::VLDMQ, dl, MVT::v2f64, MVT::Other, + Ops, 5); + } + // Other cases are autogenerated. + break; + } + case ISD::STORE: { + // VSTMQ must be custom-selected for "v2f64 store" to set the AM5Opc value. + if (Subtarget->hasVFP2() && + N->getOperand(1).getValueType().getSimpleVT().SimpleTy == MVT::v2f64) { + SDValue Chain = N->getOperand(0); + SDValue AM5Opc = + CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::ia, 4), MVT::i32); + SDValue Pred = CurDAG->getTargetConstant(14, MVT::i32); + SDValue PredReg = CurDAG->getRegister(0, MVT::i32); + SDValue Ops[] = { N->getOperand(1), N->getOperand(2), + AM5Opc, Pred, PredReg, Chain }; + return CurDAG->getMachineNode(ARM::VSTMQ, dl, MVT::Other, Ops, 6); + } // Other cases are autogenerated. 
break; } @@ -1831,16 +1867,24 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { default: break; + case Intrinsic::arm_neon_vld1: { + unsigned DOpcodes[] = { ARM::VLD1d8, ARM::VLD1d16, + ARM::VLD1d32, ARM::VLD1d64 }; + unsigned QOpcodes[] = { ARM::VLD1q8, ARM::VLD1q16, + ARM::VLD1q32, ARM::VLD1q64 }; + return SelectVLD(N, 1, DOpcodes, QOpcodes, 0); + } + case Intrinsic::arm_neon_vld2: { unsigned DOpcodes[] = { ARM::VLD2d8, ARM::VLD2d16, - ARM::VLD2d32, ARM::VLD2d64 }; + ARM::VLD2d32, ARM::VLD1q64 }; unsigned QOpcodes[] = { ARM::VLD2q8, ARM::VLD2q16, ARM::VLD2q32 }; return SelectVLD(N, 2, DOpcodes, QOpcodes, 0); } case Intrinsic::arm_neon_vld3: { unsigned DOpcodes[] = { ARM::VLD3d8, ARM::VLD3d16, - ARM::VLD3d32, ARM::VLD3d64 }; + ARM::VLD3d32, ARM::VLD1d64T }; unsigned QOpcodes0[] = { ARM::VLD3q8_UPD, ARM::VLD3q16_UPD, ARM::VLD3q32_UPD }; @@ -1852,7 +1896,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { case Intrinsic::arm_neon_vld4: { unsigned DOpcodes[] = { ARM::VLD4d8, ARM::VLD4d16, - ARM::VLD4d32, ARM::VLD4d64 }; + ARM::VLD4d32, ARM::VLD1d64Q }; unsigned QOpcodes0[] = { ARM::VLD4q8_UPD, ARM::VLD4q16_UPD, ARM::VLD4q32_UPD }; @@ -1883,16 +1927,24 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { return SelectVLDSTLane(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1); } + case Intrinsic::arm_neon_vst1: { + unsigned DOpcodes[] = { ARM::VST1d8, ARM::VST1d16, + ARM::VST1d32, ARM::VST1d64 }; + unsigned QOpcodes[] = { ARM::VST1q8, ARM::VST1q16, + ARM::VST1q32, ARM::VST1q64 }; + return SelectVST(N, 1, DOpcodes, QOpcodes, 0); + } + case Intrinsic::arm_neon_vst2: { unsigned DOpcodes[] = { ARM::VST2d8, ARM::VST2d16, - ARM::VST2d32, ARM::VST2d64 }; + ARM::VST2d32, ARM::VST1q64 }; unsigned QOpcodes[] = { ARM::VST2q8, ARM::VST2q16, ARM::VST2q32 }; return SelectVST(N, 2, DOpcodes, QOpcodes, 0); } case Intrinsic::arm_neon_vst3: { unsigned DOpcodes[] = { ARM::VST3d8, ARM::VST3d16, - ARM::VST3d32, ARM::VST3d64 }; + ARM::VST3d32, ARM::VST1d64T }; unsigned QOpcodes0[] = { ARM::VST3q8_UPD, 
ARM::VST3q16_UPD, ARM::VST3q32_UPD }; @@ -1904,7 +1956,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { case Intrinsic::arm_neon_vst4: { unsigned DOpcodes[] = { ARM::VST4d8, ARM::VST4d16, - ARM::VST4d32, ARM::VST4d64 }; + ARM::VST4d32, ARM::VST1d64Q }; unsigned QOpcodes0[] = { ARM::VST4q8_UPD, ARM::VST4q16_UPD, ARM::VST4q32_UPD }; diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 0d0a004c2840..b6c81f6910aa 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -456,6 +456,9 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) // Generic (and overly aggressive) if-conversion limits. setIfCvtBlockSizeLimit(10); setIfCvtDupBlockSizeLimit(2); + } else if (Subtarget->hasV7Ops()) { + setIfCvtBlockSizeLimit(3); + setIfCvtDupBlockSizeLimit(1); } else if (Subtarget->hasV6Ops()) { setIfCvtBlockSizeLimit(2); setIfCvtDupBlockSizeLimit(1); diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td index 4f6f05d40943..4427e50bbdb1 100644 --- a/lib/Target/ARM/ARMInstrFormats.td +++ b/lib/Target/ARM/ARMInstrFormats.td @@ -1,10 +1,10 @@ //===- ARMInstrFormats.td - ARM Instruction Formats --*- tablegen -*---------=// -// +// // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. 
-// +// //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// @@ -59,7 +59,18 @@ def NEONDupFrm : Format<28>; def MiscFrm : Format<29>; def ThumbMiscFrm : Format<30>; -def NLdStFrm : Format<31>; +def NLdStFrm : Format<31>; +def N1RegModImmFrm : Format<32>; +def N2RegFrm : Format<33>; +def NVCVTFrm : Format<34>; +def NVDupLnFrm : Format<35>; +def N2RegVShLFrm : Format<36>; +def N2RegVShRFrm : Format<37>; +def N3RegFrm : Format<38>; +def N3RegVShFrm : Format<39>; +def NVExtFrm : Format<40>; +def NVMulSLFrm : Format<41>; +def NVTBLFrm : Format<42>; // Misc flags. @@ -177,13 +188,13 @@ class InstTemplate AddrModeBits = AM.Value; - + SizeFlagVal SZ = sz; bits<3> SizeFlag = SZ.Value; IndexMode IM = im; bits<2> IndexModeBits = IM.Value; - + Format F = f; bits<6> Form = F.Value; @@ -195,7 +206,7 @@ class InstTemplate : InstTemplate; -class PseudoInst pattern> - : InstARM { let OutOperandList = oops; let InOperandList = iops; @@ -226,7 +237,7 @@ class PseudoInst pattern> : InstARM { @@ -238,9 +249,9 @@ class I pattern> + IndexMode im, Format f, InstrItinClass itin, + string opc, string asm, string cstr, + list pattern> : InstARM { let OutOperandList = oops; let InOperandList = iops; @@ -290,9 +301,9 @@ class AXI; class AInoP pattern> + string opc, string asm, list pattern> : InoP; + opc, asm, "", pattern>; // Ctrl flow instructions class ABI opcod, dag oops, dag iops, InstrItinClass itin, @@ -362,7 +373,7 @@ class AXI1 opcod, dag oops, dag iops, Format f, InstrItinClass itin, let Inst{24-21} = opcod; let Inst{27-26} = {0,0}; } -class AI1x2 pattern> : I; @@ -387,7 +398,7 @@ class AI2ldw pattern> : XI { @@ -407,7 +418,7 @@ class AI2ldb pattern> : XI { @@ -549,7 +560,7 @@ class AI2stbpo pattern> : I; @@ -853,7 +864,6 @@ class AI3stdpo pattern> @@ -961,20 +971,25 @@ class TI pattern> : ThumbI; // Two-address instructions -class TIt pattern> - : ThumbI; +class TIt pattern> 
+ : ThumbI; // tBL, tBX 32-bit instructions class TIx2 opcod1, bits<2> opcod2, bit opcod3, - dag oops, dag iops, InstrItinClass itin, string asm, list pattern> - : ThumbI, Encoding { + dag oops, dag iops, InstrItinClass itin, string asm, + list pattern> + : ThumbI, + Encoding { let Inst{31-27} = opcod1; let Inst{15-14} = opcod2; let Inst{12} = opcod3; } // BR_JT instructions -class TJTI pattern> +class TJTI pattern> : ThumbI; // Thumb1 only @@ -1001,7 +1016,7 @@ class T1JTI pattern> - : Thumb1I; // Thumb1 instruction that can either be predicated or set CPSR. @@ -1024,7 +1039,7 @@ class T1sI pattern> : Thumb1sI; + "$lhs = $dst", pattern>; // Thumb1 instruction that can be predicated. class Thumb1pI pattern> : Thumb1pI; + "$lhs = $dst", pattern>; class T1pI1 pattern> @@ -1057,7 +1072,7 @@ class T1pI2 pattern> : Thumb1pI; -class T1pIs pattern> : Thumb1pI; @@ -1146,8 +1161,8 @@ class Thumb2XI pattern> + InstrItinClass itin, + string asm, string cstr, list pattern> : InstARM { let OutOperandList = oops; let InOperandList = iops; @@ -1161,7 +1176,7 @@ class T2I; class T2Ii12 pattern> - : Thumb2I; + : Thumb2I; class T2Ii8 pattern> : Thumb2I; @@ -1196,7 +1211,7 @@ class T2JTI; class T2Ix2 pattern> + string opc, string asm, list pattern> : Thumb2I; // Two-address instructions @@ -1295,7 +1310,7 @@ class ADI5 opcod1, bits<2> opcod2, dag oops, dag iops, InstrItinClass itin, string opc, string asm, list pattern> : VFPI { + VFPLdStFrm, itin, opc, asm, "", pattern> { // TODO: Mark the instructions with the appropriate subtarget info. let Inst{27-24} = opcod1; let Inst{21-20} = opcod2; @@ -1309,7 +1324,7 @@ class ASI5 opcod1, bits<2> opcod2, dag oops, dag iops, InstrItinClass itin, string opc, string asm, list pattern> : VFPI { + VFPLdStFrm, itin, opc, asm, "", pattern> { // TODO: Mark the instructions with the appropriate subtarget info. 
let Inst{27-24} = opcod1; let Inst{21-20} = opcod2; @@ -1320,7 +1335,7 @@ class ASI5 opcod1, bits<2> opcod2, dag oops, dag iops, class AXDI5 pattern> : VFPXI { + VFPLdStMulFrm, itin, asm, cstr, pattern> { // TODO: Mark the instructions with the appropriate subtarget info. let Inst{27-25} = 0b110; let Inst{11-8} = 0b1011; @@ -1332,7 +1347,7 @@ class AXDI5 pattern> : VFPXI { + VFPLdStMulFrm, itin, asm, cstr, pattern> { // TODO: Mark the instructions with the appropriate subtarget info. let Inst{27-25} = 0b110; let Inst{11-8} = 0b1010; @@ -1353,7 +1368,8 @@ class ADuI opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4, // Double precision, binary class ADbI opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, - dag iops, InstrItinClass itin, string opc, string asm, list pattern> + dag iops, InstrItinClass itin, string opc, string asm, + list pattern> : VFPAI { let Inst{27-23} = opcod1; let Inst{21-20} = opcod2; @@ -1362,6 +1378,20 @@ class ADbI opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, let Inst{4} = op4; } +// Double precision, binary, VML[AS] (for additional predicate) +class ADbI_vmlX opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, + dag iops, InstrItinClass itin, string opc, string asm, + list pattern> + : VFPAI { + let Inst{27-23} = opcod1; + let Inst{21-20} = opcod2; + let Inst{11-8} = 0b1011; + let Inst{6} = op6; + let Inst{4} = op4; + list Predicates = [HasVFP2, UseVMLx]; +} + + // Single precision, unary class ASuI opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4, bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc, @@ -1399,7 +1429,8 @@ class ASbI opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, dag iops, // Single precision binary, if no NEON // Same as ASbI except not available if NEON is enabled class ASbIn opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, - dag iops, InstrItinClass itin, string opc, string asm, list pattern> + dag iops, InstrItinClass itin, string opc, string asm, + list pattern> : ASbI { list 
Predicates = [HasVFP2,DontUseNEONForFP]; } @@ -1419,8 +1450,8 @@ class AVConv1I opcod1, bits<2> opcod2, bits<4> opcod3, bits<4> opcod4, // VFP conversion between floating-point and fixed-point class AVConv1XI op1, bits<2> op2, bits<4> op3, bits<4> op4, bit op5, - dag oops, dag iops, InstrItinClass itin, string opc, string asm, - list pattern> + dag oops, dag iops, InstrItinClass itin, string opc, string asm, + list pattern> : AVConv1I { // size (fixed-point number): sx == 0 ? 16 : 32 let Inst{7} = op5; // sx @@ -1448,7 +1479,7 @@ class AVConv2I opcod1, bits<4> opcod2, dag oops, dag iops, InstrItinClass itin, string opc, string asm, list pattern> : AVConvXI; -class AVConv3I opcod1, bits<4> opcod2, dag oops, dag iops, +class AVConv3I opcod1, bits<4> opcod2, dag oops, dag iops, InstrItinClass itin, string opc, string asm, list pattern> : AVConvXI; @@ -1480,9 +1511,10 @@ class NeonI pattern> - : InstARM { +class NeonXI pattern> + : InstARM { let OutOperandList = oops; let InOperandList = !con(iops, (ins pred:$p)); let AsmString = !strconcat(!strconcat(opc, "${p}"), !strconcat("\t", asm)); @@ -1490,18 +1522,6 @@ class NeonXI Predicates = [HasNEON]; } -class NI pattern> - : NeonXI { -} - -class NI4 pattern> - : NeonXI { -} - class NLdSt op21_20, bits<4> op11_8, bits<4> op7_4, dag oops, dag iops, InstrItinClass itin, string opc, string dt, string asm, string cstr, list pattern> @@ -1514,17 +1534,17 @@ class NLdSt op21_20, bits<4> op11_8, bits<4> op7_4, let Inst{7-4} = op7_4; } -class NDataI pattern> - : NeonI { + : NeonI { let Inst{31-25} = 0b1111001; } -class NDataXI pattern> - : NeonXI { +class NDataXI pattern> + : NeonXI { let Inst{31-25} = 0b1111001; } @@ -1532,8 +1552,9 @@ class NDataXI op21_19, bits<4> op11_8, bit op7, bit op6, bit op5, bit op4, dag oops, dag iops, InstrItinClass itin, - string opc, string dt, string asm, string cstr, list pattern> - : NDataI { + string opc, string dt, string asm, string cstr, + list pattern> + : NDataI { let Inst{23} = op23; let 
Inst{21-19} = op21_19; let Inst{11-8} = op11_8; @@ -1548,7 +1569,7 @@ class N2V op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op6, bit op4, dag oops, dag iops, InstrItinClass itin, string opc, string dt, string asm, string cstr, list pattern> - : NDataI { + : NDataI { let Inst{24-23} = op24_23; let Inst{21-20} = op21_20; let Inst{19-18} = op19_18; @@ -1560,10 +1581,10 @@ class N2V op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, // Same as N2V except it doesn't have a datatype suffix. class N2VX op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, - bits<5> op11_7, bit op6, bit op4, - dag oops, dag iops, InstrItinClass itin, - string opc, string asm, string cstr, list pattern> - : NDataXI { + bits<5> op11_7, bit op6, bit op4, + dag oops, dag iops, InstrItinClass itin, + string opc, string asm, string cstr, list pattern> + : NDataXI { let Inst{24-23} = op24_23; let Inst{21-20} = op21_20; let Inst{19-18} = op19_18; @@ -1575,9 +1596,9 @@ class N2VX op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, // NEON 2 vector register with immediate. class N2VImm op11_8, bit op7, bit op6, bit op4, - dag oops, dag iops, InstrItinClass itin, + dag oops, dag iops, Format f, InstrItinClass itin, string opc, string dt, string asm, string cstr, list pattern> - : NDataI { + : NDataI { let Inst{24} = op24; let Inst{23} = op23; let Inst{11-8} = op11_8; @@ -1588,9 +1609,9 @@ class N2VImm op11_8, bit op7, bit op6, bit op4, // NEON 3 vector register format. class N3V op21_20, bits<4> op11_8, bit op6, bit op4, - dag oops, dag iops, InstrItinClass itin, + dag oops, dag iops, Format f, InstrItinClass itin, string opc, string dt, string asm, string cstr, list pattern> - : NDataI { + : NDataI { let Inst{24} = op24; let Inst{23} = op23; let Inst{21-20} = op21_20; @@ -1599,11 +1620,12 @@ class N3V op21_20, bits<4> op11_8, bit op6, bit op4, let Inst{4} = op4; } -// Same as N3VX except it doesn't have a data type suffix. 
-class N3VX op21_20, bits<4> op11_8, bit op6, bit op4, - dag oops, dag iops, InstrItinClass itin, - string opc, string asm, string cstr, list pattern> - : NDataXI { +// Same as N3V except it doesn't have a data type suffix. +class N3VX op21_20, bits<4> op11_8, bit op6, + bit op4, + dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list pattern> + : NDataXI { let Inst{24} = op24; let Inst{23} = op23; let Inst{21-20} = op21_20; @@ -1617,7 +1639,7 @@ class NVLaneOp opcod1, bits<4> opcod2, bits<2> opcod3, dag oops, dag iops, Format f, InstrItinClass itin, string opc, string dt, string asm, list pattern> : InstARM { + "", itin> { let Inst{27-20} = opcod1; let Inst{11-8} = opcod2; let Inst{6-5} = opcod3; @@ -1647,6 +1669,19 @@ class NVDup opcod1, bits<4> opcod2, bits<2> opcod3, : NVLaneOp; +// Vector Duplicate Lane (from scalar to all elements) +class NVDupLane op19_16, bit op6, dag oops, dag iops, + InstrItinClass itin, string opc, string dt, string asm, + list pattern> + : NDataI { + let Inst{24-23} = 0b11; + let Inst{21-20} = 0b11; + let Inst{19-16} = op19_16; + let Inst{11-7} = 0b11000; + let Inst{6} = op6; + let Inst{4} = 0; +} + // NEONFPPat - Same as Pat<>, but requires that the compiler be using NEON // for single-precision FP. class NEONFPPat : Pat { diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 26a280697b90..f2ab06f328f2 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -140,6 +140,8 @@ def IsNotDarwin : Predicate<"!Subtarget->isTargetDarwin()">; def UseMovt : Predicate<"Subtarget->useMovt()">; def DontUseMovt : Predicate<"!Subtarget->useMovt()">; +def UseVMLx : Predicate<"Subtarget->useVMLx()">; + //===----------------------------------------------------------------------===// // ARM Flag Definitions. 
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index c977cc3f5d64..ed9d31d80f36 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -115,51 +115,80 @@ def h64imm : Operand { // NEON load / store instructions //===----------------------------------------------------------------------===// +let mayLoad = 1 in { // Use vldmia to load a Q register as a D register pair. -def VLDRQ : NI4<(outs QPR:$dst), (ins addrmode4:$addr), IIC_fpLoadm, - "vldmia", "$addr, ${dst:dregpair}", - [(set QPR:$dst, (v2f64 (load addrmode4:$addr)))]> { - let Inst{27-25} = 0b110; - let Inst{24} = 0; // P bit - let Inst{23} = 1; // U bit - let Inst{20} = 1; - let Inst{11-8} = 0b1011; -} +// This is equivalent to VLDMD except that it has a Q register operand +// instead of a pair of D registers. +def VLDMQ + : AXDI5<(outs QPR:$dst), (ins addrmode5:$addr, pred:$p), + IndexModeNone, IIC_fpLoadm, + "vldm${addr:submode}${p}\t${addr:base}, ${dst:dregpair}", "", []>; +def VLDMQ_UPD + : AXDI5<(outs QPR:$dst, GPR:$wb), (ins addrmode5:$addr, pred:$p), + IndexModeUpd, IIC_fpLoadm, + "vldm${addr:submode}${p}\t${addr:base}!, ${dst:dregpair}", + "$addr.base = $wb", []>; +// Use vld1 to load a Q register as a D register pair. +// This alternative to VLDMQ allows an alignment to be specified. +// This is equivalent to VLD1q64 except that it has a Q register operand. +def VLD1q + : NLdSt<0,0b10,0b1010,0b1100, (outs QPR:$dst), (ins addrmode6:$addr), + IIC_VLD1, "vld1", "64", "${dst:dregpair}, $addr", "", []>; +def VLD1q_UPD + : NLdSt<0,0b10,0b1010,0b1100, (outs QPR:$dst, GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset), IIC_VLD1, "vld1", "64", + "${dst:dregpair}, $addr$offset", "$addr.addr = $wb", []>; +} // mayLoad = 1 + +let mayStore = 1 in { // Use vstmia to store a Q register as a D register pair. 
-def VSTRQ : NI4<(outs), (ins QPR:$src, addrmode4:$addr), IIC_fpStorem, - "vstmia", "$addr, ${src:dregpair}", - [(store (v2f64 QPR:$src), addrmode4:$addr)]> { - let Inst{27-25} = 0b110; - let Inst{24} = 0; // P bit - let Inst{23} = 1; // U bit - let Inst{20} = 0; - let Inst{11-8} = 0b1011; -} +// This is equivalent to VSTMD except that it has a Q register operand +// instead of a pair of D registers. +def VSTMQ + : AXDI5<(outs), (ins QPR:$src, addrmode5:$addr, pred:$p), + IndexModeNone, IIC_fpStorem, + "vstm${addr:submode}${p}\t${addr:base}, ${src:dregpair}", "", []>; +def VSTMQ_UPD + : AXDI5<(outs GPR:$wb), (ins QPR:$src, addrmode5:$addr, pred:$p), + IndexModeUpd, IIC_fpStorem, + "vstm${addr:submode}${p}\t${addr:base}!, ${src:dregpair}", + "$addr.base = $wb", []>; + +// Use vst1 to store a Q register as a D register pair. +// This alternative to VSTMQ allows an alignment to be specified. +// This is equivalent to VST1q64 except that it has a Q register operand. +def VST1q + : NLdSt<0,0b00,0b1010,0b1100, (outs), (ins addrmode6:$addr, QPR:$src), + IIC_VST, "vst1", "64", "${src:dregpair}, $addr", "", []>; +def VST1q_UPD + : NLdSt<0,0b00,0b1010,0b1100, (outs GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, QPR:$src), + IIC_VST, "vst1", "64", "{$src:dregpair}, $addr$offset", + "$addr.addr = $wb", []>; +} // mayStore = 1 + +let mayLoad = 1, hasExtraDefRegAllocReq = 1 in { // VLD1 : Vector Load (multiple single elements) -class VLD1D op7_4, string Dt, ValueType Ty> - : NLdSt<0,0b10,0b0111,op7_4, (outs DPR:$dst), (ins addrmode6:$addr), IIC_VLD1, - "vld1", Dt, "\\{$dst\\}, $addr", "", - [(set DPR:$dst, (Ty (int_arm_neon_vld1 addrmode6:$addr)))]>; -class VLD1Q op7_4, string Dt, ValueType Ty> - : NLdSt<0,0b10,0b1010,op7_4, (outs QPR:$dst), (ins addrmode6:$addr), IIC_VLD1, - "vld1", Dt, "${dst:dregpair}, $addr", "", - [(set QPR:$dst, (Ty (int_arm_neon_vld1 addrmode6:$addr)))]>; +class VLD1D op7_4, string Dt> + : NLdSt<0,0b10,0b0111,op7_4, (outs DPR:$dst), + (ins 
addrmode6:$addr), IIC_VLD1, + "vld1", Dt, "\\{$dst\\}, $addr", "", []>; +class VLD1Q op7_4, string Dt> + : NLdSt<0,0b10,0b1010,op7_4, (outs DPR:$dst1, DPR:$dst2), + (ins addrmode6:$addr), IIC_VLD1, + "vld1", Dt, "\\{$dst1, $dst2\\}, $addr", "", []>; -def VLD1d8 : VLD1D<0b0000, "8", v8i8>; -def VLD1d16 : VLD1D<0b0100, "16", v4i16>; -def VLD1d32 : VLD1D<0b1000, "32", v2i32>; -def VLD1df : VLD1D<0b1000, "32", v2f32>; -def VLD1d64 : VLD1D<0b1100, "64", v1i64>; +def VLD1d8 : VLD1D<0b0000, "8">; +def VLD1d16 : VLD1D<0b0100, "16">; +def VLD1d32 : VLD1D<0b1000, "32">; +def VLD1d64 : VLD1D<0b1100, "64">; -def VLD1q8 : VLD1Q<0b0000, "8", v16i8>; -def VLD1q16 : VLD1Q<0b0100, "16", v8i16>; -def VLD1q32 : VLD1Q<0b1000, "32", v4i32>; -def VLD1qf : VLD1Q<0b1000, "32", v4f32>; -def VLD1q64 : VLD1Q<0b1100, "64", v2i64>; - -let mayLoad = 1 in { +def VLD1q8 : VLD1Q<0b0000, "8">; +def VLD1q16 : VLD1Q<0b0100, "16">; +def VLD1q32 : VLD1Q<0b1000, "32">; +def VLD1q64 : VLD1Q<0b1100, "64">; // ...with address register writeback: class VLD1DWB op7_4, string Dt> @@ -182,54 +211,48 @@ def VLD1q8_UPD : VLD1QWB<0b0000, "8">; def VLD1q16_UPD : VLD1QWB<0b0100, "16">; def VLD1q32_UPD : VLD1QWB<0b1000, "32">; def VLD1q64_UPD : VLD1QWB<0b1100, "64">; -} // mayLoad = 1 -let mayLoad = 1, hasExtraDefRegAllocReq = 1 in { - -// These (dreg triple/quadruple) are for disassembly only. 
+// ...with 3 registers (some of these are only for the disassembler): class VLD1D3 op7_4, string Dt> : NLdSt<0,0b10,0b0110,op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3), (ins addrmode6:$addr), IIC_VLD1, "vld1", Dt, - "\\{$dst1, $dst2, $dst3\\}, $addr", "", - [/* For disassembly only; pattern left blank */]>; -class VLD1D4 op7_4, string Dt> - : NLdSt<0,0b10,0b0010,op7_4,(outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4), - (ins addrmode6:$addr), IIC_VLD1, "vld1", Dt, - "\\{$dst1, $dst2, $dst3, $dst4\\}, $addr", "", - [/* For disassembly only; pattern left blank */]>; - -def VLD1d8T : VLD1D3<0b0000, "8">; -def VLD1d16T : VLD1D3<0b0100, "16">; -def VLD1d32T : VLD1D3<0b1000, "32">; -// VLD1d64T : implemented as VLD3d64 - -def VLD1d8Q : VLD1D4<0b0000, "8">; -def VLD1d16Q : VLD1D4<0b0100, "16">; -def VLD1d32Q : VLD1D4<0b1000, "32">; -// VLD1d64Q : implemented as VLD4d64 - -// ...with address register writeback: + "\\{$dst1, $dst2, $dst3\\}, $addr", "", []>; class VLD1D3WB op7_4, string Dt> : NLdSt<0,0b10,0b0110,op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, GPR:$wb), (ins addrmode6:$addr, am6offset:$offset), IIC_VLD1, "vld1", Dt, - "\\{$dst1, $dst2, $dst3\\}, $addr$offset", "$addr.addr = $wb", - [/* For disassembly only; pattern left blank */]>; + "\\{$dst1, $dst2, $dst3\\}, $addr$offset", "$addr.addr = $wb", []>; + +def VLD1d8T : VLD1D3<0b0000, "8">; +def VLD1d16T : VLD1D3<0b0100, "16">; +def VLD1d32T : VLD1D3<0b1000, "32">; +def VLD1d64T : VLD1D3<0b1100, "64">; + +def VLD1d8T_UPD : VLD1D3WB<0b0000, "8">; +def VLD1d16T_UPD : VLD1D3WB<0b0100, "16">; +def VLD1d32T_UPD : VLD1D3WB<0b1000, "32">; +def VLD1d64T_UPD : VLD1D3WB<0b1100, "64">; + +// ...with 4 registers (some of these are only for the disassembler): +class VLD1D4 op7_4, string Dt> + : NLdSt<0,0b10,0b0010,op7_4,(outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4), + (ins addrmode6:$addr), IIC_VLD1, "vld1", Dt, + "\\{$dst1, $dst2, $dst3, $dst4\\}, $addr", "", []>; class VLD1D4WB op7_4, string Dt> : 
NLdSt<0,0b10,0b0010,op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb), (ins addrmode6:$addr, am6offset:$offset), IIC_VLD1, "vld1", Dt, "\\{$dst1, $dst2, $dst3, $dst4\\}, $addr$offset", "$addr.addr = $wb", - [/* For disassembly only; pattern left blank */]>; + []>; -def VLD1d8T_UPD : VLD1D3WB<0b0000, "8">; -def VLD1d16T_UPD : VLD1D3WB<0b0100, "16">; -def VLD1d32T_UPD : VLD1D3WB<0b1000, "32">; -// VLD1d64T_UPD : implemented as VLD3d64_UPD +def VLD1d8Q : VLD1D4<0b0000, "8">; +def VLD1d16Q : VLD1D4<0b0100, "16">; +def VLD1d32Q : VLD1D4<0b1000, "32">; +def VLD1d64Q : VLD1D4<0b1100, "64">; def VLD1d8Q_UPD : VLD1D4WB<0b0000, "8">; def VLD1d16Q_UPD : VLD1D4WB<0b0100, "16">; def VLD1d32Q_UPD : VLD1D4WB<0b1000, "32">; -// VLD1d64Q_UPD : implemented as VLD4d64_UPD +def VLD1d64Q_UPD : VLD1D4WB<0b1100, "64">; // VLD2 : Vector Load (multiple 2-element structures) class VLD2D op11_8, bits<4> op7_4, string Dt> @@ -245,9 +268,6 @@ class VLD2Q op7_4, string Dt> def VLD2d8 : VLD2D<0b1000, 0b0000, "8">; def VLD2d16 : VLD2D<0b1000, 0b0100, "16">; def VLD2d32 : VLD2D<0b1000, 0b1000, "32">; -def VLD2d64 : NLdSt<0,0b10,0b1010,0b1100, (outs DPR:$dst1, DPR:$dst2), - (ins addrmode6:$addr), IIC_VLD1, - "vld1", "64", "\\{$dst1, $dst2\\}, $addr", "", []>; def VLD2q8 : VLD2Q<0b0000, "8">; def VLD2q16 : VLD2Q<0b0100, "16">; @@ -269,11 +289,6 @@ class VLD2QWB op7_4, string Dt> def VLD2d8_UPD : VLD2DWB<0b1000, 0b0000, "8">; def VLD2d16_UPD : VLD2DWB<0b1000, 0b0100, "16">; def VLD2d32_UPD : VLD2DWB<0b1000, 0b1000, "32">; -def VLD2d64_UPD : NLdSt<0,0b10,0b1010,0b1100, - (outs DPR:$dst1, DPR:$dst2, GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset), IIC_VLD1, - "vld1", "64", "\\{$dst1, $dst2\\}, $addr$offset", - "$addr.addr = $wb", []>; def VLD2q8_UPD : VLD2QWB<0b0000, "8">; def VLD2q16_UPD : VLD2QWB<0b0100, "16">; @@ -296,10 +311,6 @@ class VLD3D op11_8, bits<4> op7_4, string Dt> def VLD3d8 : VLD3D<0b0100, 0b0000, "8">; def VLD3d16 : VLD3D<0b0100, 0b0100, "16">; def VLD3d32 : 
VLD3D<0b0100, 0b1000, "32">; -def VLD3d64 : NLdSt<0,0b10,0b0110,0b1100, - (outs DPR:$dst1, DPR:$dst2, DPR:$dst3), - (ins addrmode6:$addr), IIC_VLD1, - "vld1", "64", "\\{$dst1, $dst2, $dst3\\}, $addr", "", []>; // ...with address register writeback: class VLD3DWB op11_8, bits<4> op7_4, string Dt> @@ -312,11 +323,6 @@ class VLD3DWB op11_8, bits<4> op7_4, string Dt> def VLD3d8_UPD : VLD3DWB<0b0100, 0b0000, "8">; def VLD3d16_UPD : VLD3DWB<0b0100, 0b0100, "16">; def VLD3d32_UPD : VLD3DWB<0b0100, 0b1000, "32">; -def VLD3d64_UPD : NLdSt<0,0b10,0b0110,0b1100, - (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset), IIC_VLD1, - "vld1", "64", "\\{$dst1, $dst2, $dst3\\}, $addr$offset", - "$addr.addr = $wb", []>; // ...with double-spaced registers (non-updating versions for disassembly only): def VLD3q8 : VLD3D<0b0101, 0b0000, "8">; @@ -341,11 +347,6 @@ class VLD4D op11_8, bits<4> op7_4, string Dt> def VLD4d8 : VLD4D<0b0000, 0b0000, "8">; def VLD4d16 : VLD4D<0b0000, 0b0100, "16">; def VLD4d32 : VLD4D<0b0000, 0b1000, "32">; -def VLD4d64 : NLdSt<0,0b10,0b0010,0b1100, - (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4), - (ins addrmode6:$addr), IIC_VLD1, - "vld1", "64", "\\{$dst1, $dst2, $dst3, $dst4\\}, $addr", - "", []>; // ...with address register writeback: class VLD4DWB op11_8, bits<4> op7_4, string Dt> @@ -358,13 +359,6 @@ class VLD4DWB op11_8, bits<4> op7_4, string Dt> def VLD4d8_UPD : VLD4DWB<0b0000, 0b0000, "8">; def VLD4d16_UPD : VLD4DWB<0b0000, 0b0100, "16">; def VLD4d32_UPD : VLD4DWB<0b0000, 0b1000, "32">; -def VLD4d64_UPD : NLdSt<0,0b10,0b0010,0b1100, - (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4, - GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset), IIC_VLD1, - "vld1", "64", - "\\{$dst1, $dst2, $dst3, $dst4\\}, $addr$offset", - "$addr.addr = $wb", []>; // ...with double-spaced registers (non-updating versions for disassembly only): def VLD4q8 : VLD4D<0b0001, 0b0000, "8">; @@ -383,62 +377,62 @@ def VLD4q32odd_UPD : 
VLD4DWB<0b0001, 0b1000, "32">; // FIXME: Not yet implemented. // VLD2LN : Vector Load (single 2-element structure to one lane) -class VLD2LN op11_8, string Dt> - : NLdSt<1, 0b10, op11_8, {?,?,?,?}, (outs DPR:$dst1, DPR:$dst2), +class VLD2LN op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2), (ins addrmode6:$addr, DPR:$src1, DPR:$src2, nohash_imm:$lane), IIC_VLD2, "vld2", Dt, "\\{$dst1[$lane], $dst2[$lane]\\}, $addr", "$src1 = $dst1, $src2 = $dst2", []>; -def VLD2LNd8 : VLD2LN<0b0001, "8">; -def VLD2LNd16 : VLD2LN<0b0101, "16"> { let Inst{5} = 0; } -def VLD2LNd32 : VLD2LN<0b1001, "32"> { let Inst{6} = 0; } +def VLD2LNd8 : VLD2LN<0b0001, {?,?,?,?}, "8">; +def VLD2LNd16 : VLD2LN<0b0101, {?,?,0,?}, "16">; +def VLD2LNd32 : VLD2LN<0b1001, {?,0,?,?}, "32">; // ...with double-spaced registers: -def VLD2LNq16 : VLD2LN<0b0101, "16"> { let Inst{5} = 1; } -def VLD2LNq32 : VLD2LN<0b1001, "32"> { let Inst{6} = 1; } +def VLD2LNq16 : VLD2LN<0b0101, {?,?,1,?}, "16">; +def VLD2LNq32 : VLD2LN<0b1001, {?,1,?,?}, "32">; // ...alternate versions to be allocated odd register numbers: -def VLD2LNq16odd : VLD2LN<0b0101, "16"> { let Inst{5} = 1; } -def VLD2LNq32odd : VLD2LN<0b1001, "32"> { let Inst{6} = 1; } +def VLD2LNq16odd : VLD2LN<0b0101, {?,?,1,?}, "16">; +def VLD2LNq32odd : VLD2LN<0b1001, {?,1,?,?}, "32">; // ...with address register writeback: -class VLD2LNWB op11_8, string Dt> - : NLdSt<1, 0b10, op11_8, {?,?,?,?}, (outs DPR:$dst1, DPR:$dst2, GPR:$wb), +class VLD2LNWB op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2, GPR:$wb), (ins addrmode6:$addr, am6offset:$offset, DPR:$src1, DPR:$src2, nohash_imm:$lane), IIC_VLD2, "vld2", Dt, "\\{$dst1[$lane], $dst2[$lane]\\}, $addr$offset", "$src1 = $dst1, $src2 = $dst2, $addr.addr = $wb", []>; -def VLD2LNd8_UPD : VLD2LNWB<0b0001, "8">; -def VLD2LNd16_UPD : VLD2LNWB<0b0101, "16"> { let Inst{5} = 0; } -def VLD2LNd32_UPD : VLD2LNWB<0b1001, "32"> { let Inst{6} 
= 0; } +def VLD2LNd8_UPD : VLD2LNWB<0b0001, {?,?,?,?}, "8">; +def VLD2LNd16_UPD : VLD2LNWB<0b0101, {?,?,0,?}, "16">; +def VLD2LNd32_UPD : VLD2LNWB<0b1001, {?,0,?,?}, "32">; -def VLD2LNq16_UPD : VLD2LNWB<0b0101, "16"> { let Inst{5} = 1; } -def VLD2LNq32_UPD : VLD2LNWB<0b1001, "32"> { let Inst{6} = 1; } +def VLD2LNq16_UPD : VLD2LNWB<0b0101, {?,?,1,?}, "16">; +def VLD2LNq32_UPD : VLD2LNWB<0b1001, {?,1,?,?}, "32">; // VLD3LN : Vector Load (single 3-element structure to one lane) -class VLD3LN op11_8, string Dt> - : NLdSt<1, 0b10, op11_8, {?,?,?,?}, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3), +class VLD3LN op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3), (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, nohash_imm:$lane), IIC_VLD3, "vld3", Dt, "\\{$dst1[$lane], $dst2[$lane], $dst3[$lane]\\}, $addr", "$src1 = $dst1, $src2 = $dst2, $src3 = $dst3", []>; -def VLD3LNd8 : VLD3LN<0b0010, "8"> { let Inst{4} = 0; } -def VLD3LNd16 : VLD3LN<0b0110, "16"> { let Inst{5-4} = 0b00; } -def VLD3LNd32 : VLD3LN<0b1010, "32"> { let Inst{6-4} = 0b000; } +def VLD3LNd8 : VLD3LN<0b0010, {?,?,?,0}, "8">; +def VLD3LNd16 : VLD3LN<0b0110, {?,?,0,0}, "16">; +def VLD3LNd32 : VLD3LN<0b1010, {?,0,0,0}, "32">; // ...with double-spaced registers: -def VLD3LNq16 : VLD3LN<0b0110, "16"> { let Inst{5-4} = 0b10; } -def VLD3LNq32 : VLD3LN<0b1010, "32"> { let Inst{6-4} = 0b100; } +def VLD3LNq16 : VLD3LN<0b0110, {?,?,1,0}, "16">; +def VLD3LNq32 : VLD3LN<0b1010, {?,1,0,0}, "32">; // ...alternate versions to be allocated odd register numbers: -def VLD3LNq16odd : VLD3LN<0b0110, "16"> { let Inst{5-4} = 0b10; } -def VLD3LNq32odd : VLD3LN<0b1010, "32"> { let Inst{6-4} = 0b100; } +def VLD3LNq16odd : VLD3LN<0b0110, {?,?,1,0}, "16">; +def VLD3LNq32odd : VLD3LN<0b1010, {?,1,0,0}, "32">; // ...with address register writeback: -class VLD3LNWB op11_8, string Dt> - : NLdSt<1, 0b10, op11_8, {?,?,?,?}, +class VLD3LNWB op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 
0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, GPR:$wb), (ins addrmode6:$addr, am6offset:$offset, DPR:$src1, DPR:$src2, DPR:$src3, nohash_imm:$lane), @@ -447,37 +441,37 @@ class VLD3LNWB op11_8, string Dt> "$src1 = $dst1, $src2 = $dst2, $src3 = $dst3, $addr.addr = $wb", []>; -def VLD3LNd8_UPD : VLD3LNWB<0b0010, "8"> { let Inst{4} = 0; } -def VLD3LNd16_UPD : VLD3LNWB<0b0110, "16"> { let Inst{5-4} = 0b00; } -def VLD3LNd32_UPD : VLD3LNWB<0b1010, "32"> { let Inst{6-4} = 0b000; } +def VLD3LNd8_UPD : VLD3LNWB<0b0010, {?,?,?,0}, "8">; +def VLD3LNd16_UPD : VLD3LNWB<0b0110, {?,?,0,0}, "16">; +def VLD3LNd32_UPD : VLD3LNWB<0b1010, {?,0,0,0}, "32">; -def VLD3LNq16_UPD : VLD3LNWB<0b0110, "16"> { let Inst{5-4} = 0b10; } -def VLD3LNq32_UPD : VLD3LNWB<0b1010, "32"> { let Inst{6-4} = 0b100; } +def VLD3LNq16_UPD : VLD3LNWB<0b0110, {?,?,1,0}, "16">; +def VLD3LNq32_UPD : VLD3LNWB<0b1010, {?,1,0,0}, "32">; // VLD4LN : Vector Load (single 4-element structure to one lane) -class VLD4LN op11_8, string Dt> - : NLdSt<1, 0b10, op11_8, {?,?,?,?}, +class VLD4LN op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4), (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4, nohash_imm:$lane), IIC_VLD4, "vld4", Dt, "\\{$dst1[$lane], $dst2[$lane], $dst3[$lane], $dst4[$lane]\\}, $addr", "$src1 = $dst1, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4", []>; -def VLD4LNd8 : VLD4LN<0b0011, "8">; -def VLD4LNd16 : VLD4LN<0b0111, "16"> { let Inst{5} = 0; } -def VLD4LNd32 : VLD4LN<0b1011, "32"> { let Inst{6} = 0; } +def VLD4LNd8 : VLD4LN<0b0011, {?,?,?,?}, "8">; +def VLD4LNd16 : VLD4LN<0b0111, {?,?,0,?}, "16">; +def VLD4LNd32 : VLD4LN<0b1011, {?,0,?,?}, "32">; // ...with double-spaced registers: -def VLD4LNq16 : VLD4LN<0b0111, "16"> { let Inst{5} = 1; } -def VLD4LNq32 : VLD4LN<0b1011, "32"> { let Inst{6} = 1; } +def VLD4LNq16 : VLD4LN<0b0111, {?,?,1,?}, "16">; +def VLD4LNq32 : VLD4LN<0b1011, {?,1,?,?}, "32">; // ...alternate 
versions to be allocated odd register numbers: -def VLD4LNq16odd : VLD4LN<0b0111, "16"> { let Inst{5} = 1; } -def VLD4LNq32odd : VLD4LN<0b1011, "32"> { let Inst{6} = 1; } +def VLD4LNq16odd : VLD4LN<0b0111, {?,?,1,?}, "16">; +def VLD4LNq32odd : VLD4LN<0b1011, {?,1,?,?}, "32">; // ...with address register writeback: -class VLD4LNWB op11_8, string Dt> - : NLdSt<1, 0b10, op11_8, {?,?,?,?}, +class VLD4LNWB op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb), (ins addrmode6:$addr, am6offset:$offset, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4, nohash_imm:$lane), @@ -486,12 +480,12 @@ class VLD4LNWB op11_8, string Dt> "$src1 = $dst1, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4, $addr.addr = $wb", []>; -def VLD4LNd8_UPD : VLD4LNWB<0b0011, "8">; -def VLD4LNd16_UPD : VLD4LNWB<0b0111, "16"> { let Inst{5} = 0; } -def VLD4LNd32_UPD : VLD4LNWB<0b1011, "32"> { let Inst{6} = 0; } +def VLD4LNd8_UPD : VLD4LNWB<0b0011, {?,?,?,?}, "8">; +def VLD4LNd16_UPD : VLD4LNWB<0b0111, {?,?,0,?}, "16">; +def VLD4LNd32_UPD : VLD4LNWB<0b1011, {?,0,?,?}, "32">; -def VLD4LNq16_UPD : VLD4LNWB<0b0111, "16"> { let Inst{5} = 1; } -def VLD4LNq32_UPD : VLD4LNWB<0b1011, "32"> { let Inst{6} = 1; } +def VLD4LNq16_UPD : VLD4LNWB<0b0111, {?,?,1,?}, "16">; +def VLD4LNq32_UPD : VLD4LNWB<0b1011, {?,1,?,?}, "32">; // VLD1DUP : Vector Load (single element to all lanes) // VLD2DUP : Vector Load (single 2-element structure to all lanes) @@ -500,32 +494,27 @@ def VLD4LNq32_UPD : VLD4LNWB<0b1011, "32"> { let Inst{6} = 1; } // FIXME: Not yet implemented. 
} // mayLoad = 1, hasExtraDefRegAllocReq = 1 -// VST1 : Vector Store (multiple single elements) -class VST1D op7_4, string Dt, ValueType Ty> - : NLdSt<0,0b00,0b0111,op7_4, (outs), (ins addrmode6:$addr, DPR:$src), IIC_VST, - "vst1", Dt, "\\{$src\\}, $addr", "", - [(int_arm_neon_vst1 addrmode6:$addr, (Ty DPR:$src))]>; -class VST1Q op7_4, string Dt, ValueType Ty> - : NLdSt<0,0b00,0b1010,op7_4, (outs), (ins addrmode6:$addr, QPR:$src), IIC_VST, - "vst1", Dt, "${src:dregpair}, $addr", "", - [(int_arm_neon_vst1 addrmode6:$addr, (Ty QPR:$src))]>; - -let hasExtraSrcRegAllocReq = 1 in { -def VST1d8 : VST1D<0b0000, "8", v8i8>; -def VST1d16 : VST1D<0b0100, "16", v4i16>; -def VST1d32 : VST1D<0b1000, "32", v2i32>; -def VST1df : VST1D<0b1000, "32", v2f32>; -def VST1d64 : VST1D<0b1100, "64", v1i64>; - -def VST1q8 : VST1Q<0b0000, "8", v16i8>; -def VST1q16 : VST1Q<0b0100, "16", v8i16>; -def VST1q32 : VST1Q<0b1000, "32", v4i32>; -def VST1qf : VST1Q<0b1000, "32", v4f32>; -def VST1q64 : VST1Q<0b1100, "64", v2i64>; -} // hasExtraSrcRegAllocReq - let mayStore = 1, hasExtraSrcRegAllocReq = 1 in { +// VST1 : Vector Store (multiple single elements) +class VST1D op7_4, string Dt> + : NLdSt<0,0b00,0b0111,op7_4, (outs), (ins addrmode6:$addr, DPR:$src), IIC_VST, + "vst1", Dt, "\\{$src\\}, $addr", "", []>; +class VST1Q op7_4, string Dt> + : NLdSt<0,0b00,0b1010,op7_4, (outs), + (ins addrmode6:$addr, DPR:$src1, DPR:$src2), IIC_VST, + "vst1", Dt, "\\{$src1, $src2\\}, $addr", "", []>; + +def VST1d8 : VST1D<0b0000, "8">; +def VST1d16 : VST1D<0b0100, "16">; +def VST1d32 : VST1D<0b1000, "32">; +def VST1d64 : VST1D<0b1100, "64">; + +def VST1q8 : VST1Q<0b0000, "8">; +def VST1q16 : VST1Q<0b0100, "16">; +def VST1q32 : VST1Q<0b1000, "32">; +def VST1q64 : VST1Q<0b1100, "64">; + // ...with address register writeback: class VST1DWB op7_4, string Dt> : NLdSt<0, 0b00, 0b0111, op7_4, (outs GPR:$wb), @@ -546,53 +535,50 @@ def VST1q16_UPD : VST1QWB<0b0100, "16">; def VST1q32_UPD : VST1QWB<0b1000, "32">; def 
VST1q64_UPD : VST1QWB<0b1100, "64">; -// These (dreg triple/quadruple) are for disassembly only. +// ...with 3 registers (some of these are only for the disassembler): class VST1D3 op7_4, string Dt> : NLdSt<0, 0b00, 0b0110, op7_4, (outs), (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3), - IIC_VST, "vst1", Dt, "\\{$src1, $src2, $src3\\}, $addr", "", - [/* For disassembly only; pattern left blank */]>; -class VST1D4 op7_4, string Dt> - : NLdSt<0, 0b00, 0b0010, op7_4, (outs), - (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4), - IIC_VST, "vst1", Dt, "\\{$src1, $src2, $src3, $src4\\}, $addr", "", - [/* For disassembly only; pattern left blank */]>; - -def VST1d8T : VST1D3<0b0000, "8">; -def VST1d16T : VST1D3<0b0100, "16">; -def VST1d32T : VST1D3<0b1000, "32">; -// VST1d64T : implemented as VST3d64 - -def VST1d8Q : VST1D4<0b0000, "8">; -def VST1d16Q : VST1D4<0b0100, "16">; -def VST1d32Q : VST1D4<0b1000, "32">; -// VST1d64Q : implemented as VST4d64 - -// ...with address register writeback: + IIC_VST, "vst1", Dt, "\\{$src1, $src2, $src3\\}, $addr", "", []>; class VST1D3WB op7_4, string Dt> : NLdSt<0, 0b00, 0b0110, op7_4, (outs GPR:$wb), (ins addrmode6:$addr, am6offset:$offset, DPR:$src1, DPR:$src2, DPR:$src3), IIC_VST, "vst1", Dt, "\\{$src1, $src2, $src3\\}, $addr$offset", - "$addr.addr = $wb", - [/* For disassembly only; pattern left blank */]>; + "$addr.addr = $wb", []>; + +def VST1d8T : VST1D3<0b0000, "8">; +def VST1d16T : VST1D3<0b0100, "16">; +def VST1d32T : VST1D3<0b1000, "32">; +def VST1d64T : VST1D3<0b1100, "64">; + +def VST1d8T_UPD : VST1D3WB<0b0000, "8">; +def VST1d16T_UPD : VST1D3WB<0b0100, "16">; +def VST1d32T_UPD : VST1D3WB<0b1000, "32">; +def VST1d64T_UPD : VST1D3WB<0b1100, "64">; + +// ...with 4 registers (some of these are only for the disassembler): +class VST1D4 op7_4, string Dt> + : NLdSt<0, 0b00, 0b0010, op7_4, (outs), + (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4), + IIC_VST, "vst1", Dt, "\\{$src1, $src2, 
$src3, $src4\\}, $addr", "", + []>; class VST1D4WB op7_4, string Dt> : NLdSt<0, 0b00, 0b0010, op7_4, (outs GPR:$wb), (ins addrmode6:$addr, am6offset:$offset, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4), IIC_VST, "vst1", Dt, "\\{$src1, $src2, $src3, $src4\\}, $addr$offset", - "$addr.addr = $wb", - [/* For disassembly only; pattern left blank */]>; + "$addr.addr = $wb", []>; -def VST1d8T_UPD : VST1D3WB<0b0000, "8">; -def VST1d16T_UPD : VST1D3WB<0b0100, "16">; -def VST1d32T_UPD : VST1D3WB<0b1000, "32">; -// VST1d64T_UPD : implemented as VST3d64_UPD +def VST1d8Q : VST1D4<0b0000, "8">; +def VST1d16Q : VST1D4<0b0100, "16">; +def VST1d32Q : VST1D4<0b1000, "32">; +def VST1d64Q : VST1D4<0b1100, "64">; def VST1d8Q_UPD : VST1D4WB<0b0000, "8">; def VST1d16Q_UPD : VST1D4WB<0b0100, "16">; def VST1d32Q_UPD : VST1D4WB<0b1000, "32">; -// VST1d64Q_UPD : implemented as VST4d64_UPD +def VST1d64Q_UPD : VST1D4WB<0b1100, "64">; // VST2 : Vector Store (multiple 2-element structures) class VST2D op11_8, bits<4> op7_4, string Dt> @@ -608,9 +594,6 @@ class VST2Q op7_4, string Dt> def VST2d8 : VST2D<0b1000, 0b0000, "8">; def VST2d16 : VST2D<0b1000, 0b0100, "16">; def VST2d32 : VST2D<0b1000, 0b1000, "32">; -def VST2d64 : NLdSt<0,0b00,0b1010,0b1100, (outs), - (ins addrmode6:$addr, DPR:$src1, DPR:$src2), IIC_VST, - "vst1", "64", "\\{$src1, $src2\\}, $addr", "", []>; def VST2q8 : VST2Q<0b0000, "8">; def VST2q16 : VST2Q<0b0100, "16">; @@ -632,11 +615,6 @@ class VST2QWB op7_4, string Dt> def VST2d8_UPD : VST2DWB<0b1000, 0b0000, "8">; def VST2d16_UPD : VST2DWB<0b1000, 0b0100, "16">; def VST2d32_UPD : VST2DWB<0b1000, 0b1000, "32">; -def VST2d64_UPD : NLdSt<0,0b00,0b1010,0b1100, (outs GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset, - DPR:$src1, DPR:$src2), IIC_VST, - "vst1", "64", "\\{$src1, $src2\\}, $addr$offset", - "$addr.addr = $wb", []>; def VST2q8_UPD : VST2QWB<0b0000, "8">; def VST2q16_UPD : VST2QWB<0b0100, "16">; @@ -659,10 +637,6 @@ class VST3D op11_8, bits<4> op7_4, string Dt> def 
VST3d8 : VST3D<0b0100, 0b0000, "8">; def VST3d16 : VST3D<0b0100, 0b0100, "16">; def VST3d32 : VST3D<0b0100, 0b1000, "32">; -def VST3d64 : NLdSt<0,0b00,0b0110,0b1100, (outs), - (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3), - IIC_VST, - "vst1", "64", "\\{$src1, $src2, $src3\\}, $addr", "", []>; // ...with address register writeback: class VST3DWB op11_8, bits<4> op7_4, string Dt> @@ -675,11 +649,6 @@ class VST3DWB op11_8, bits<4> op7_4, string Dt> def VST3d8_UPD : VST3DWB<0b0100, 0b0000, "8">; def VST3d16_UPD : VST3DWB<0b0100, 0b0100, "16">; def VST3d32_UPD : VST3DWB<0b0100, 0b1000, "32">; -def VST3d64_UPD : NLdSt<0,0b00,0b0110,0b1100, (outs GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset, - DPR:$src1, DPR:$src2, DPR:$src3), IIC_VST, - "vst1", "64", "\\{$src1, $src2, $src3\\}, $addr$offset", - "$addr.addr = $wb", []>; // ...with double-spaced registers (non-updating versions for disassembly only): def VST3q8 : VST3D<0b0101, 0b0000, "8">; @@ -704,11 +673,6 @@ class VST4D op11_8, bits<4> op7_4, string Dt> def VST4d8 : VST4D<0b0000, 0b0000, "8">; def VST4d16 : VST4D<0b0000, 0b0100, "16">; def VST4d32 : VST4D<0b0000, 0b1000, "32">; -def VST4d64 : NLdSt<0,0b00,0b0010,0b1100, (outs), - (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, - DPR:$src4), IIC_VST, - "vst1", "64", "\\{$src1, $src2, $src3, $src4\\}, $addr", - "", []>; // ...with address register writeback: class VST4DWB op11_8, bits<4> op7_4, string Dt> @@ -721,12 +685,6 @@ class VST4DWB op11_8, bits<4> op7_4, string Dt> def VST4d8_UPD : VST4DWB<0b0000, 0b0000, "8">; def VST4d16_UPD : VST4DWB<0b0000, 0b0100, "16">; def VST4d32_UPD : VST4DWB<0b0000, 0b1000, "32">; -def VST4d64_UPD : NLdSt<0,0b00,0b0010,0b1100, (outs GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset, - DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4), IIC_VST, - "vst1", "64", - "\\{$src1, $src2, $src3, $src4\\}, $addr$offset", - "$addr.addr = $wb", []>; // ...with double-spaced registers (non-updating versions for disassembly only): 
def VST4q8 : VST4D<0b0001, 0b0000, "8">; @@ -745,109 +703,109 @@ def VST4q32odd_UPD : VST4DWB<0b0001, 0b1000, "32">; // FIXME: Not yet implemented. // VST2LN : Vector Store (single 2-element structure from one lane) -class VST2LN op11_8, string Dt> - : NLdSt<1, 0b00, op11_8, {?,?,?,?}, (outs), +class VST2LN op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b00, op11_8, op7_4, (outs), (ins addrmode6:$addr, DPR:$src1, DPR:$src2, nohash_imm:$lane), IIC_VST, "vst2", Dt, "\\{$src1[$lane], $src2[$lane]\\}, $addr", "", []>; -def VST2LNd8 : VST2LN<0b0001, "8">; -def VST2LNd16 : VST2LN<0b0101, "16"> { let Inst{5} = 0; } -def VST2LNd32 : VST2LN<0b1001, "32"> { let Inst{6} = 0; } +def VST2LNd8 : VST2LN<0b0001, {?,?,?,?}, "8">; +def VST2LNd16 : VST2LN<0b0101, {?,?,0,?}, "16">; +def VST2LNd32 : VST2LN<0b1001, {?,0,?,?}, "32">; // ...with double-spaced registers: -def VST2LNq16 : VST2LN<0b0101, "16"> { let Inst{5} = 1; } -def VST2LNq32 : VST2LN<0b1001, "32"> { let Inst{6} = 1; } +def VST2LNq16 : VST2LN<0b0101, {?,?,1,?}, "16">; +def VST2LNq32 : VST2LN<0b1001, {?,1,?,?}, "32">; // ...alternate versions to be allocated odd register numbers: -def VST2LNq16odd : VST2LN<0b0101, "16"> { let Inst{5} = 1; } -def VST2LNq32odd : VST2LN<0b1001, "32"> { let Inst{6} = 1; } +def VST2LNq16odd : VST2LN<0b0101, {?,?,1,?}, "16">; +def VST2LNq32odd : VST2LN<0b1001, {?,1,?,?}, "32">; // ...with address register writeback: -class VST2LNWB op11_8, string Dt> - : NLdSt<1, 0b00, op11_8, {?,?,?,?}, (outs GPR:$wb), +class VST2LNWB op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b00, op11_8, op7_4, (outs GPR:$wb), (ins addrmode6:$addr, am6offset:$offset, DPR:$src1, DPR:$src2, nohash_imm:$lane), IIC_VST, "vst2", Dt, "\\{$src1[$lane], $src2[$lane]\\}, $addr$offset", "$addr.addr = $wb", []>; -def VST2LNd8_UPD : VST2LNWB<0b0001, "8">; -def VST2LNd16_UPD : VST2LNWB<0b0101, "16"> { let Inst{5} = 0; } -def VST2LNd32_UPD : VST2LNWB<0b1001, "32"> { let Inst{6} = 0; } +def VST2LNd8_UPD : VST2LNWB<0b0001, {?,?,?,?}, 
"8">; +def VST2LNd16_UPD : VST2LNWB<0b0101, {?,?,0,?}, "16">; +def VST2LNd32_UPD : VST2LNWB<0b1001, {?,0,?,?}, "32">; -def VST2LNq16_UPD : VST2LNWB<0b0101, "16"> { let Inst{5} = 1; } -def VST2LNq32_UPD : VST2LNWB<0b1001, "32"> { let Inst{6} = 1; } +def VST2LNq16_UPD : VST2LNWB<0b0101, {?,?,1,?}, "16">; +def VST2LNq32_UPD : VST2LNWB<0b1001, {?,1,?,?}, "32">; // VST3LN : Vector Store (single 3-element structure from one lane) -class VST3LN op11_8, string Dt> - : NLdSt<1, 0b00, op11_8, {?,?,?,?}, (outs), +class VST3LN op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b00, op11_8, op7_4, (outs), (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, nohash_imm:$lane), IIC_VST, "vst3", Dt, "\\{$src1[$lane], $src2[$lane], $src3[$lane]\\}, $addr", "", []>; -def VST3LNd8 : VST3LN<0b0010, "8"> { let Inst{4} = 0; } -def VST3LNd16 : VST3LN<0b0110, "16"> { let Inst{5-4} = 0b00; } -def VST3LNd32 : VST3LN<0b1010, "32"> { let Inst{6-4} = 0b000; } +def VST3LNd8 : VST3LN<0b0010, {?,?,?,0}, "8">; +def VST3LNd16 : VST3LN<0b0110, {?,?,0,0}, "16">; +def VST3LNd32 : VST3LN<0b1010, {?,0,0,0}, "32">; // ...with double-spaced registers: -def VST3LNq16 : VST3LN<0b0110, "16"> { let Inst{5-4} = 0b10; } -def VST3LNq32 : VST3LN<0b1010, "32"> { let Inst{6-4} = 0b100; } +def VST3LNq16 : VST3LN<0b0110, {?,?,1,0}, "16">; +def VST3LNq32 : VST3LN<0b1010, {?,1,0,0}, "32">; // ...alternate versions to be allocated odd register numbers: -def VST3LNq16odd : VST3LN<0b0110, "16"> { let Inst{5-4} = 0b10; } -def VST3LNq32odd : VST3LN<0b1010, "32"> { let Inst{6-4} = 0b100; } +def VST3LNq16odd : VST3LN<0b0110, {?,?,1,0}, "16">; +def VST3LNq32odd : VST3LN<0b1010, {?,1,0,0}, "32">; // ...with address register writeback: -class VST3LNWB op11_8, string Dt> - : NLdSt<1, 0b00, op11_8, {?,?,?,?}, (outs GPR:$wb), +class VST3LNWB op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b00, op11_8, op7_4, (outs GPR:$wb), (ins addrmode6:$addr, am6offset:$offset, DPR:$src1, DPR:$src2, DPR:$src3, nohash_imm:$lane), IIC_VST, "vst3", 
Dt, "\\{$src1[$lane], $src2[$lane], $src3[$lane]\\}, $addr$offset", "$addr.addr = $wb", []>; -def VST3LNd8_UPD : VST3LNWB<0b0010, "8"> { let Inst{4} = 0; } -def VST3LNd16_UPD : VST3LNWB<0b0110, "16"> { let Inst{5-4} = 0b00; } -def VST3LNd32_UPD : VST3LNWB<0b1010, "32"> { let Inst{6-4} = 0b000; } +def VST3LNd8_UPD : VST3LNWB<0b0010, {?,?,?,0}, "8">; +def VST3LNd16_UPD : VST3LNWB<0b0110, {?,?,0,0}, "16">; +def VST3LNd32_UPD : VST3LNWB<0b1010, {?,0,0,0}, "32">; -def VST3LNq16_UPD : VST3LNWB<0b0110, "16"> { let Inst{5-4} = 0b10; } -def VST3LNq32_UPD : VST3LNWB<0b1010, "32"> { let Inst{6-4} = 0b100; } +def VST3LNq16_UPD : VST3LNWB<0b0110, {?,?,1,0}, "16">; +def VST3LNq32_UPD : VST3LNWB<0b1010, {?,1,0,0}, "32">; // VST4LN : Vector Store (single 4-element structure from one lane) -class VST4LN op11_8, string Dt> - : NLdSt<1, 0b00, op11_8, {?,?,?,?}, (outs), +class VST4LN op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b00, op11_8, op7_4, (outs), (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4, nohash_imm:$lane), IIC_VST, "vst4", Dt, "\\{$src1[$lane], $src2[$lane], $src3[$lane], $src4[$lane]\\}, $addr", "", []>; -def VST4LNd8 : VST4LN<0b0011, "8">; -def VST4LNd16 : VST4LN<0b0111, "16"> { let Inst{5} = 0; } -def VST4LNd32 : VST4LN<0b1011, "32"> { let Inst{6} = 0; } +def VST4LNd8 : VST4LN<0b0011, {?,?,?,?}, "8">; +def VST4LNd16 : VST4LN<0b0111, {?,?,0,?}, "16">; +def VST4LNd32 : VST4LN<0b1011, {?,0,?,?}, "32">; // ...with double-spaced registers: -def VST4LNq16 : VST4LN<0b0111, "16"> { let Inst{5} = 1; } -def VST4LNq32 : VST4LN<0b1011, "32"> { let Inst{6} = 1; } +def VST4LNq16 : VST4LN<0b0111, {?,?,1,?}, "16">; +def VST4LNq32 : VST4LN<0b1011, {?,1,?,?}, "32">; // ...alternate versions to be allocated odd register numbers: -def VST4LNq16odd : VST4LN<0b0111, "16"> { let Inst{5} = 1; } -def VST4LNq32odd : VST4LN<0b1011, "32"> { let Inst{6} = 1; } +def VST4LNq16odd : VST4LN<0b0111, {?,?,1,?}, "16">; +def VST4LNq32odd : VST4LN<0b1011, {?,1,?,?}, "32">; // ...with 
address register writeback: -class VST4LNWB op11_8, string Dt> - : NLdSt<1, 0b00, op11_8, {?,?,?,?}, (outs GPR:$wb), +class VST4LNWB op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b00, op11_8, op7_4, (outs GPR:$wb), (ins addrmode6:$addr, am6offset:$offset, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4, nohash_imm:$lane), IIC_VST, "vst4", Dt, "\\{$src1[$lane], $src2[$lane], $src3[$lane], $src4[$lane]\\}, $addr$offset", "$addr.addr = $wb", []>; -def VST4LNd8_UPD : VST4LNWB<0b0011, "8">; -def VST4LNd16_UPD : VST4LNWB<0b0111, "16"> { let Inst{5} = 0; } -def VST4LNd32_UPD : VST4LNWB<0b1011, "32"> { let Inst{6} = 0; } +def VST4LNd8_UPD : VST4LNWB<0b0011, {?,?,?,?}, "8">; +def VST4LNd16_UPD : VST4LNWB<0b0111, {?,?,0,?}, "16">; +def VST4LNd32_UPD : VST4LNWB<0b1011, {?,0,?,?}, "32">; -def VST4LNq16_UPD : VST4LNWB<0b0111, "16"> { let Inst{5} = 1; } -def VST4LNq32_UPD : VST4LNWB<0b1011, "32"> { let Inst{6} = 1; } +def VST4LNq16_UPD : VST4LNWB<0b0111, {?,?,1,?}, "16">; +def VST4LNq32_UPD : VST4LNWB<0b1011, {?,1,?,?}, "32">; } // mayStore = 1, hasExtraSrcRegAllocReq = 1 @@ -906,18 +864,18 @@ class N2VD op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode> : N2V; class N2VQ op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode> : N2V; // Basic 2-register intrinsics, both double- and quad-register. 
class N2VDInt op24_23, bits<2> op21_20, bits<2> op19_18, - bits<2> op17_16, bits<5> op11_7, bit op4, + bits<2> op17_16, bits<5> op11_7, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N2V op21_20, bits<4> op11_8, bit op4, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable> : N3V { + (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src1, DPR_VFP2:$src2), N3RegFrm, + IIC_VBIND, OpcodeStr, Dt, "$dst, $src1, $src2", "", []> { let isCommutable = Commutable; } @@ -975,7 +933,7 @@ class N3VD op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable> : N3V { let isCommutable = Commutable; @@ -986,27 +944,28 @@ class N3VDX op21_20, bits<4> op11_8, bit op4, ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable> : N3VX{ let isCommutable = Commutable; } + class N3VDSL op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp> : N3V<0, 1, op21_20, op11_8, 1, 0, (outs DPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane), - itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", [(set (Ty DPR:$dst), (Ty (ShOp (Ty DPR:$src1), - (Ty (NEONvduplane (Ty DPR_VFP2:$src2), imm:$lane)))))]>{ + (Ty (NEONvduplane (Ty DPR_VFP2:$src2),imm:$lane)))))]> { let isCommutable = 0; } class N3VDSL16 op21_20, bits<4> op11_8, string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp> : N3V<0, 1, op21_20, op11_8, 1, 0, (outs DPR:$dst), (ins DPR:$src1, DPR_8:$src2, nohash_imm:$lane), - IIC_VMULi16D, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + NVMulSLFrm, IIC_VMULi16D, OpcodeStr, Dt,"$dst, $src1, $src2[$lane]","", [(set (Ty DPR:$dst), (Ty (ShOp (Ty DPR:$src1), (Ty (NEONvduplane (Ty DPR_8:$src2), imm:$lane)))))]> { @@ -1017,7 +976,7 @@ class N3VQ op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, 
string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable> : N3V { let isCommutable = Commutable; @@ -1026,7 +985,7 @@ class N3VQX op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable> : N3VX{ let isCommutable = Commutable; @@ -1036,7 +995,7 @@ class N3VQSL op21_20, bits<4> op11_8, ValueType ResTy, ValueType OpTy, SDNode ShOp> : N3V<1, 1, op21_20, op11_8, 1, 0, (outs QPR:$dst), (ins QPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane), - itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", [(set (ResTy QPR:$dst), (ResTy (ShOp (ResTy QPR:$src1), (ResTy (NEONvduplane (OpTy DPR_VFP2:$src2), @@ -1047,7 +1006,7 @@ class N3VQSL16 op21_20, bits<4> op11_8, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode ShOp> : N3V<1, 1, op21_20, op11_8, 1, 0, (outs QPR:$dst), (ins QPR:$src1, DPR_8:$src2, nohash_imm:$lane), - IIC_VMULi16Q, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + NVMulSLFrm, IIC_VMULi16Q, OpcodeStr, Dt,"$dst, $src1, $src2[$lane]","", [(set (ResTy QPR:$dst), (ResTy (ShOp (ResTy QPR:$src1), (ResTy (NEONvduplane (OpTy DPR_8:$src2), @@ -1057,10 +1016,10 @@ class N3VQSL16 op21_20, bits<4> op11_8, string OpcodeStr, string Dt, // Basic 3-register intrinsics, both double- and quad-register. 
class N3VDInt op21_20, bits<4> op11_8, bit op4, - InstrItinClass itin, string OpcodeStr, string Dt, + Format f, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp, bit Commutable> : N3V { let isCommutable = Commutable; @@ -1069,7 +1028,7 @@ class N3VDIntSL op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, Intrinsic IntOp> : N3V<0, 1, op21_20, op11_8, 1, 0, (outs DPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane), - itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", [(set (Ty DPR:$dst), (Ty (IntOp (Ty DPR:$src1), (Ty (NEONvduplane (Ty DPR_VFP2:$src2), @@ -1080,19 +1039,18 @@ class N3VDIntSL16 op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, Intrinsic IntOp> : N3V<0, 1, op21_20, op11_8, 1, 0, (outs DPR:$dst), (ins DPR:$src1, DPR_8:$src2, nohash_imm:$lane), - itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", [(set (Ty DPR:$dst), (Ty (IntOp (Ty DPR:$src1), - (Ty (NEONvduplane (Ty DPR_8:$src2), - imm:$lane)))))]> { + (Ty (NEONvduplane (Ty DPR_8:$src2), imm:$lane)))))]> { let isCommutable = 0; } class N3VQInt op21_20, bits<4> op11_8, bit op4, - InstrItinClass itin, string OpcodeStr, string Dt, + Format f, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp, bit Commutable> : N3V { let isCommutable = Commutable; @@ -1102,7 +1060,7 @@ class N3VQIntSL op21_20, bits<4> op11_8, InstrItinClass itin, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N3V<1, 1, op21_20, op11_8, 1, 0, (outs QPR:$dst), (ins QPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane), - itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", [(set (ResTy QPR:$dst), (ResTy (IntOp (ResTy QPR:$src1), (ResTy (NEONvduplane (OpTy 
DPR_VFP2:$src2), @@ -1114,7 +1072,7 @@ class N3VQIntSL16 op21_20, bits<4> op11_8, InstrItinClass itin, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N3V<1, 1, op21_20, op11_8, 1, 0, (outs QPR:$dst), (ins QPR:$src1, DPR_8:$src2, nohash_imm:$lane), - itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", [(set (ResTy QPR:$dst), (ResTy (IntOp (ResTy QPR:$src1), (ResTy (NEONvduplane (OpTy DPR_8:$src2), @@ -1128,14 +1086,14 @@ class N3VSMulOp op21_20, bits<4> op11_8, bit op4, ValueType Ty, SDNode MulOp, SDNode OpNode> : N3V; class N3VDMulOp op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, SDNode MulOp, SDNode OpNode> : N3V; @@ -1144,7 +1102,8 @@ class N3VDMulOpSL op21_20, bits<4> op11_8, InstrItinClass itin, ValueType Ty, SDNode MulOp, SDNode ShOp> : N3V<0, 1, op21_20, op11_8, 1, 0, (outs DPR:$dst), - (ins DPR:$src1, DPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane), itin, + (ins DPR:$src1, DPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst", [(set (Ty DPR:$dst), (Ty (ShOp (Ty DPR:$src1), @@ -1156,7 +1115,8 @@ class N3VDMulOpSL16 op21_20, bits<4> op11_8, InstrItinClass itin, ValueType Ty, SDNode MulOp, SDNode ShOp> : N3V<0, 1, op21_20, op11_8, 1, 0, (outs DPR:$dst), - (ins DPR:$src1, DPR:$src2, DPR_8:$src3, nohash_imm:$lane), itin, + (ins DPR:$src1, DPR:$src2, DPR_8:$src3, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst", [(set (Ty DPR:$dst), (Ty (ShOp (Ty DPR:$src1), @@ -1168,7 +1128,7 @@ class N3VQMulOp op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, SDNode MulOp, SDNode OpNode> : N3V; @@ -1177,7 +1137,8 @@ class N3VQMulOpSL op21_20, bits<4> op11_8, InstrItinClass itin, SDNode MulOp, SDNode ShOp> : N3V<1, 1, op21_20, op11_8, 1, 0, (outs QPR:$dst), - (ins QPR:$src1, QPR:$src2, 
DPR_VFP2:$src3, nohash_imm:$lane), itin, + (ins QPR:$src1, QPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst", [(set (ResTy QPR:$dst), (ResTy (ShOp (ResTy QPR:$src1), @@ -1190,7 +1151,8 @@ class N3VQMulOpSL16 op21_20, bits<4> op11_8, InstrItinClass itin, SDNode MulOp, SDNode ShOp> : N3V<1, 1, op21_20, op11_8, 1, 0, (outs QPR:$dst), - (ins QPR:$src1, QPR:$src2, DPR_8:$src3, nohash_imm:$lane), itin, + (ins QPR:$src1, QPR:$src2, DPR_8:$src3, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst", [(set (ResTy QPR:$dst), (ResTy (ShOp (ResTy QPR:$src1), @@ -1204,7 +1166,7 @@ class N3VDInt3 op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N3V; @@ -1212,7 +1174,7 @@ class N3VQInt3 op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N3V; @@ -1223,7 +1185,7 @@ class N3VLInt3 op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD, Intrinsic IntOp> : N3V; @@ -1232,7 +1194,8 @@ class N3VLInt3SL op21_20, bits<4> op11_8, InstrItinClass itin, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N3V op21_20, bits<4> op11_8, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N3V op21_20, bits<4> op11_8, bit op4, string OpcodeStr, string Dt, ValueType TyD, ValueType TyQ, Intrinsic IntOp, bit Commutable> : N3V { let isCommutable = Commutable; @@ -1268,7 +1232,7 @@ class N3VLInt op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD, Intrinsic IntOp, bit Commutable> : N3V { let isCommutable = Commutable; @@ -1277,8 +1241,8 @@ class N3VLIntSL op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N3V op21_20, bits<4> 
op11_8, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N3V op21_20, bits<4> op11_8, bit op4, string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD, Intrinsic IntOp, bit Commutable> : N3V { let isCommutable = Commutable; @@ -1344,17 +1308,17 @@ class N2VQPLInt2 op24_23, bits<2> op21_20, bits<2> op19_18, // Shift by immediate, // both double- and quad-register. class N2VDSh op11_8, bit op7, bit op4, - InstrItinClass itin, string OpcodeStr, string Dt, + Format f, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, SDNode OpNode> : N2VImm; class N2VQSh op11_8, bit op7, bit op4, - InstrItinClass itin, string OpcodeStr, string Dt, + Format f, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, SDNode OpNode> : N2VImm; @@ -1363,8 +1327,8 @@ class N2VLSh op11_8, bit op7, bit op6, bit op4, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode> : N2VImm; @@ -1373,7 +1337,7 @@ class N2VNSh op11_8, bit op7, bit op6, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode> : N2VImm; @@ -1383,14 +1347,14 @@ class N2VNSh op11_8, bit op7, bit op6, bit op4, class N2VDShAdd op11_8, bit op7, bit op4, string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp> : N2VImm; class N2VQShAdd op11_8, bit op7, bit op4, string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp> : N2VImm; @@ -1398,15 +1362,15 @@ class N2VQShAdd op11_8, bit op7, bit op4, // Shift by immediate and insert, // both double- and quad-register. 
class N2VDShIns op11_8, bit op7, bit op4, - string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp> + Format f, string OpcodeStr, string Dt, ValueType Ty,SDNode ShOp> : N2VImm; class N2VQShIns op11_8, bit op7, bit op4, - string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp> + Format f, string OpcodeStr, string Dt, ValueType Ty,SDNode ShOp> : N2VImm; @@ -1416,15 +1380,15 @@ class N2VCvtD op11_8, bit op7, bit op4, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N2VImm; class N2VCvtQ op11_8, bit op7, bit op4, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N2VImm; //===----------------------------------------------------------------------===// @@ -1568,24 +1532,24 @@ multiclass N2VLInt_QHS op24_23, bits<5> op11_7, bit op6, bit op4, // Neon 3-register vector intrinsics. // First with only element sizes of 16 and 32 bits: -multiclass N3VInt_HS op11_8, bit op4, +multiclass N3VInt_HS op11_8, bit op4, Format f, InstrItinClass itinD16, InstrItinClass itinD32, InstrItinClass itinQ16, InstrItinClass itinQ32, string OpcodeStr, string Dt, Intrinsic IntOp, bit Commutable = 0> { // 64-bit vector types. - def v4i16 : N3VDInt; - def v2i32 : N3VDInt; // 128-bit vector types. 
- def v8i16 : N3VQInt; - def v4i32 : N3VQInt; } @@ -1605,38 +1569,37 @@ multiclass N3VIntSL_HS op11_8, } // ....then also with element size of 8 bits: -multiclass N3VInt_QHS op11_8, bit op4, +multiclass N3VInt_QHS op11_8, bit op4, Format f, InstrItinClass itinD16, InstrItinClass itinD32, InstrItinClass itinQ16, InstrItinClass itinQ32, string OpcodeStr, string Dt, Intrinsic IntOp, bit Commutable = 0> - : N3VInt_HS { - def v8i8 : N3VDInt; - def v16i8 : N3VQInt; } // ....then also with element size of 64 bits: -multiclass N3VInt_QHSD op11_8, bit op4, +multiclass N3VInt_QHSD op11_8, bit op4, Format f, InstrItinClass itinD16, InstrItinClass itinD32, InstrItinClass itinQ16, InstrItinClass itinQ32, string OpcodeStr, string Dt, Intrinsic IntOp, bit Commutable = 0> - : N3VInt_QHS { - def v1i64 : N3VDInt; - def v2i64 : N3VQInt; } - // Neon Narrowing 3-register vector intrinsics, // source operand element sizes of 16, 32 and 64 bits: multiclass N3VNInt_HSD op11_8, bit op4, @@ -1866,46 +1829,46 @@ multiclass N2VPLInt2_QHS op24_23, bits<2> op21_20, bits<2> op17_16, // Neon 2-register vector shift by immediate, +// with f of either N2RegVShLFrm or N2RegVShRFrm // element sizes of 8, 16, 32 and 64 bits: multiclass N2VSh_QHSD op11_8, bit op4, - InstrItinClass itin, string OpcodeStr, string Dt, - SDNode OpNode> { + InstrItinClass itin, string OpcodeStr, string Dt, + SDNode OpNode, Format f> { // 64-bit vector types. - def v8i8 : N2VDSh { let Inst{21-19} = 0b001; // imm6 = 001xxx } - def v4i16 : N2VDSh { let Inst{21-20} = 0b01; // imm6 = 01xxxx } - def v2i32 : N2VDSh { let Inst{21} = 0b1; // imm6 = 1xxxxx } - def v1i64 : N2VDSh; // imm6 = xxxxxx // 128-bit vector types. 
- def v16i8 : N2VQSh { let Inst{21-19} = 0b001; // imm6 = 001xxx } - def v8i16 : N2VQSh { let Inst{21-20} = 0b01; // imm6 = 01xxxx } - def v4i32 : N2VQSh { let Inst{21} = 0b1; // imm6 = 1xxxxx } - def v2i64 : N2VQSh; // imm6 = xxxxxx } - // Neon Shift-Accumulate vector operations, // element sizes of 8, 16, 32 and 64 bits: multiclass N2VShAdd_QHSD op11_8, bit op4, @@ -1947,41 +1910,43 @@ multiclass N2VShAdd_QHSD op11_8, bit op4, // Neon Shift-Insert vector operations, +// with f of either N2RegVShLFrm or N2RegVShRFrm // element sizes of 8, 16, 32 and 64 bits: multiclass N2VShIns_QHSD op11_8, bit op4, - string OpcodeStr, SDNode ShOp> { + string OpcodeStr, SDNode ShOp, + Format f> { // 64-bit vector types. def v8i8 : N2VDShIns { + f, OpcodeStr, "8", v8i8, ShOp> { let Inst{21-19} = 0b001; // imm6 = 001xxx } def v4i16 : N2VDShIns { + f, OpcodeStr, "16", v4i16, ShOp> { let Inst{21-20} = 0b01; // imm6 = 01xxxx } def v2i32 : N2VDShIns { + f, OpcodeStr, "32", v2i32, ShOp> { let Inst{21} = 0b1; // imm6 = 1xxxxx } def v1i64 : N2VDShIns; + f, OpcodeStr, "64", v1i64, ShOp>; // imm6 = xxxxxx // 128-bit vector types. 
def v16i8 : N2VQShIns { + f, OpcodeStr, "8", v16i8, ShOp> { let Inst{21-19} = 0b001; // imm6 = 001xxx } def v8i16 : N2VQShIns { + f, OpcodeStr, "16", v8i16, ShOp> { let Inst{21-20} = 0b01; // imm6 = 01xxxx } def v4i32 : N2VQShIns { + f, OpcodeStr, "32", v4i32, ShOp> { let Inst{21} = 0b1; // imm6 = 1xxxxx } def v2i64 : N2VQShIns; + f, OpcodeStr, "64", v2i64, ShOp>; // imm6 = xxxxxx } @@ -2044,20 +2009,26 @@ defm VADDLu : N3VLInt_QHS<1,1,0b0000,0, IIC_VSHLiD, "vaddl", "u", defm VADDWs : N3VWInt_QHS<0,1,0b0001,0, "vaddw", "s", int_arm_neon_vaddws, 0>; defm VADDWu : N3VWInt_QHS<1,1,0b0001,0, "vaddw", "u", int_arm_neon_vaddwu, 0>; // VHADD : Vector Halving Add -defm VHADDs : N3VInt_QHS<0,0,0b0000,0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, - IIC_VBINi4Q, "vhadd", "s", int_arm_neon_vhadds, 1>; -defm VHADDu : N3VInt_QHS<1,0,0b0000,0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, - IIC_VBINi4Q, "vhadd", "u", int_arm_neon_vhaddu, 1>; +defm VHADDs : N3VInt_QHS<0, 0, 0b0000, 0, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vhadd", "s", int_arm_neon_vhadds, 1>; +defm VHADDu : N3VInt_QHS<1, 0, 0b0000, 0, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vhadd", "u", int_arm_neon_vhaddu, 1>; // VRHADD : Vector Rounding Halving Add -defm VRHADDs : N3VInt_QHS<0,0,0b0001,0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, - IIC_VBINi4Q, "vrhadd", "s", int_arm_neon_vrhadds, 1>; -defm VRHADDu : N3VInt_QHS<1,0,0b0001,0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, - IIC_VBINi4Q, "vrhadd", "u", int_arm_neon_vrhaddu, 1>; +defm VRHADDs : N3VInt_QHS<0, 0, 0b0001, 0, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vrhadd", "s", int_arm_neon_vrhadds, 1>; +defm VRHADDu : N3VInt_QHS<1, 0, 0b0001, 0, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vrhadd", "u", int_arm_neon_vrhaddu, 1>; // VQADD : Vector Saturating Add -defm VQADDs : N3VInt_QHSD<0,0,0b0000,1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, - IIC_VBINi4Q, "vqadd", "s", 
int_arm_neon_vqadds, 1>; -defm VQADDu : N3VInt_QHSD<1,0,0b0000,1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, - IIC_VBINi4Q, "vqadd", "u", int_arm_neon_vqaddu, 1>; +defm VQADDs : N3VInt_QHSD<0, 0, 0b0000, 1, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vqadd", "s", int_arm_neon_vqadds, 1>; +defm VQADDu : N3VInt_QHSD<1, 0, 0b0000, 1, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vqadd", "u", int_arm_neon_vqaddu, 1>; // VADDHN : Vector Add and Narrow Returning High Half (D = Q + Q) defm VADDHN : N3VNInt_HSD<0,1,0b0100,0, "vaddhn", "i", int_arm_neon_vaddhn, 1>; @@ -2070,10 +2041,10 @@ defm VRADDHN : N3VNInt_HSD<1,1,0b0100,0, "vraddhn", "i", // VMUL : Vector Multiply (integer, polynomial and floating-point) defm VMUL : N3V_QHS<0, 0, 0b1001, 1, IIC_VMULi16D, IIC_VMULi32D, IIC_VMULi16Q, IIC_VMULi32Q, "vmul", "i", mul, 1>; -def VMULpd : N3VDInt<1, 0, 0b00, 0b1001, 1, IIC_VMULi16D, "vmul", "p8", - v8i8, v8i8, int_arm_neon_vmulp, 1>; -def VMULpq : N3VQInt<1, 0, 0b00, 0b1001, 1, IIC_VMULi16Q, "vmul", "p8", - v16i8, v16i8, int_arm_neon_vmulp, 1>; +def VMULpd : N3VDInt<1, 0, 0b00, 0b1001, 1, N3RegFrm, IIC_VMULi16D, "vmul", + "p8", v8i8, v8i8, int_arm_neon_vmulp, 1>; +def VMULpq : N3VQInt<1, 0, 0b00, 0b1001, 1, N3RegFrm, IIC_VMULi16Q, "vmul", + "p8", v16i8, v16i8, int_arm_neon_vmulp, 1>; def VMULfd : N3VD<1, 0, 0b00, 0b1101, 1, IIC_VBIND, "vmul", "f32", v2f32, v2f32, fmul, 1>; def VMULfq : N3VQ<1, 0, 0b00, 0b1101, 1, IIC_VBINQ, "vmul", "f32", @@ -2103,7 +2074,7 @@ def : Pat<(v4f32 (fmul (v4f32 QPR:$src1), (SubReg_i32_lane imm:$lane)))>; // VQDMULH : Vector Saturating Doubling Multiply Returning High Half -defm VQDMULH : N3VInt_HS<0, 0, 0b1011, 0, IIC_VMULi16D, IIC_VMULi32D, +defm VQDMULH : N3VInt_HS<0, 0, 0b1011, 0, N3RegFrm, IIC_VMULi16D, IIC_VMULi32D, IIC_VMULi16Q, IIC_VMULi32Q, "vqdmulh", "s", int_arm_neon_vqdmulh, 1>; defm VQDMULHsl: N3VIntSL_HS<0b1100, IIC_VMULi16D, IIC_VMULi32D, @@ -2125,8 +2096,8 @@ def : Pat<(v4i32 
(int_arm_neon_vqdmulh (v4i32 QPR:$src1), (SubReg_i32_lane imm:$lane)))>; // VQRDMULH : Vector Rounding Saturating Doubling Multiply Returning High Half -defm VQRDMULH : N3VInt_HS<1, 0, 0b1011, 0, IIC_VMULi16D, IIC_VMULi32D, - IIC_VMULi16Q, IIC_VMULi32Q, +defm VQRDMULH : N3VInt_HS<1, 0, 0b1011, 0, N3RegFrm, + IIC_VMULi16D,IIC_VMULi32D,IIC_VMULi16Q,IIC_VMULi32Q, "vqrdmulh", "s", int_arm_neon_vqrdmulh, 1>; defm VQRDMULHsl : N3VIntSL_HS<0b1101, IIC_VMULi16D, IIC_VMULi32D, IIC_VMULi16Q, IIC_VMULi32Q, @@ -2285,18 +2256,18 @@ defm VSUBLu : N3VLInt_QHS<1,1,0b0010,0, IIC_VSHLiD, "vsubl", "u", defm VSUBWs : N3VWInt_QHS<0,1,0b0011,0, "vsubw", "s", int_arm_neon_vsubws, 0>; defm VSUBWu : N3VWInt_QHS<1,1,0b0011,0, "vsubw", "u", int_arm_neon_vsubwu, 0>; // VHSUB : Vector Halving Subtract -defm VHSUBs : N3VInt_QHS<0, 0, 0b0010, 0, IIC_VBINi4D, IIC_VBINi4D, - IIC_VBINi4Q, IIC_VBINi4Q, +defm VHSUBs : N3VInt_QHS<0, 0, 0b0010, 0, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, "vhsub", "s", int_arm_neon_vhsubs, 0>; -defm VHSUBu : N3VInt_QHS<1, 0, 0b0010, 0, IIC_VBINi4D, IIC_VBINi4D, - IIC_VBINi4Q, IIC_VBINi4Q, +defm VHSUBu : N3VInt_QHS<1, 0, 0b0010, 0, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, "vhsub", "u", int_arm_neon_vhsubu, 0>; // VQSUB : Vector Saturing Subtract -defm VQSUBs : N3VInt_QHSD<0, 0, 0b0010, 1, IIC_VBINi4D, IIC_VBINi4D, - IIC_VBINi4Q, IIC_VBINi4Q, +defm VQSUBs : N3VInt_QHSD<0, 0, 0b0010, 1, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, "vqsub", "s", int_arm_neon_vqsubs, 0>; -defm VQSUBu : N3VInt_QHSD<1, 0, 0b0010, 1, IIC_VBINi4D, IIC_VBINi4D, - IIC_VBINi4Q, IIC_VBINi4Q, +defm VQSUBu : N3VInt_QHSD<1, 0, 0b0010, 1, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, "vqsub", "u", int_arm_neon_vqsubu, 0>; // VSUBHN : Vector Subtract and Narrow Returning High Half (D = Q - Q) defm VSUBHN : N3VNInt_HSD<0,1,0b0110,0, "vsubhn", "i", @@ -2323,8 +2294,8 @@ defm VCGEs : N3V_QHS<0, 0, 0b0011, 1, IIC_VBINi4D, 
IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, "vcge", "s", NEONvcge, 0>; defm VCGEu : N3V_QHS<1, 0, 0b0011, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, "vcge", "u", NEONvcgeu, 0>; -def VCGEfd : N3VD<1,0,0b00,0b1110,0, IIC_VBIND, "vcge", "f32", - v2i32, v2f32, NEONvcge, 0>; +def VCGEfd : N3VD<1,0,0b00,0b1110,0, IIC_VBIND, "vcge", "f32", v2i32, v2f32, + NEONvcge, 0>; def VCGEfq : N3VQ<1,0,0b00,0b1110,0, IIC_VBINQ, "vcge", "f32", v4i32, v4f32, NEONvcge, 0>; // For disassembly only. @@ -2351,21 +2322,27 @@ defm VCLTz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00100, 0, "vclt", "s", "$dst, $src, #0">; // VACGE : Vector Absolute Compare Greater Than or Equal (aka VCAGE) -def VACGEd : N3VDInt<1, 0, 0b00, 0b1110, 1, IIC_VBIND, "vacge", "f32", - v2i32, v2f32, int_arm_neon_vacged, 0>; -def VACGEq : N3VQInt<1, 0, 0b00, 0b1110, 1, IIC_VBINQ, "vacge", "f32", - v4i32, v4f32, int_arm_neon_vacgeq, 0>; +def VACGEd : N3VDInt<1, 0, 0b00, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacge", + "f32", v2i32, v2f32, int_arm_neon_vacged, 0>; +def VACGEq : N3VQInt<1, 0, 0b00, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacge", + "f32", v4i32, v4f32, int_arm_neon_vacgeq, 0>; // VACGT : Vector Absolute Compare Greater Than (aka VCAGT) -def VACGTd : N3VDInt<1, 0, 0b10, 0b1110, 1, IIC_VBIND, "vacgt", "f32", - v2i32, v2f32, int_arm_neon_vacgtd, 0>; -def VACGTq : N3VQInt<1, 0, 0b10, 0b1110, 1, IIC_VBINQ, "vacgt", "f32", - v4i32, v4f32, int_arm_neon_vacgtq, 0>; +def VACGTd : N3VDInt<1, 0, 0b10, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacgt", + "f32", v2i32, v2f32, int_arm_neon_vacgtd, 0>; +def VACGTq : N3VQInt<1, 0, 0b10, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacgt", + "f32", v4i32, v4f32, int_arm_neon_vacgtq, 0>; // VTST : Vector Test Bits defm VTST : N3V_QHS<0, 0, 0b1000, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, "vtst", "", NEONvtst, 1>; // Vector Bitwise Operations. 
+def vnot8 : PatFrag<(ops node:$in), + (xor node:$in, (bitconvert (v8i8 immAllOnesV)))>; +def vnot16 : PatFrag<(ops node:$in), + (xor node:$in, (bitconvert (v16i8 immAllOnesV)))>; + + // VAND : Vector Bitwise AND def VANDd : N3VDX<0, 0, 0b00, 0b0001, 1, IIC_VBINiD, "vand", v2i32, v2i32, and, 1>; @@ -2386,74 +2363,80 @@ def VORRq : N3VQX<0, 0, 0b10, 0b0001, 1, IIC_VBINiQ, "vorr", // VBIC : Vector Bitwise Bit Clear (AND NOT) def VBICd : N3VX<0, 0, 0b01, 0b0001, 0, 1, (outs DPR:$dst), - (ins DPR:$src1, DPR:$src2), IIC_VBINiD, - "vbic", "$dst, $src1, $src2", "", - [(set DPR:$dst, (v2i32 (and DPR:$src1, - (vnot_conv DPR:$src2))))]>; + (ins DPR:$src1, DPR:$src2), N3RegFrm, IIC_VBINiD, + "vbic", "$dst, $src1, $src2", "", + [(set DPR:$dst, (v2i32 (and DPR:$src1, + (vnot8 DPR:$src2))))]>; def VBICq : N3VX<0, 0, 0b01, 0b0001, 1, 1, (outs QPR:$dst), - (ins QPR:$src1, QPR:$src2), IIC_VBINiQ, - "vbic", "$dst, $src1, $src2", "", - [(set QPR:$dst, (v4i32 (and QPR:$src1, - (vnot_conv QPR:$src2))))]>; + (ins QPR:$src1, QPR:$src2), N3RegFrm, IIC_VBINiQ, + "vbic", "$dst, $src1, $src2", "", + [(set QPR:$dst, (v4i32 (and QPR:$src1, + (vnot16 QPR:$src2))))]>; // VORN : Vector Bitwise OR NOT def VORNd : N3VX<0, 0, 0b11, 0b0001, 0, 1, (outs DPR:$dst), - (ins DPR:$src1, DPR:$src2), IIC_VBINiD, - "vorn", "$dst, $src1, $src2", "", - [(set DPR:$dst, (v2i32 (or DPR:$src1, - (vnot_conv DPR:$src2))))]>; + (ins DPR:$src1, DPR:$src2), N3RegFrm, IIC_VBINiD, + "vorn", "$dst, $src1, $src2", "", + [(set DPR:$dst, (v2i32 (or DPR:$src1, + (vnot8 DPR:$src2))))]>; def VORNq : N3VX<0, 0, 0b11, 0b0001, 1, 1, (outs QPR:$dst), - (ins QPR:$src1, QPR:$src2), IIC_VBINiQ, - "vorn", "$dst, $src1, $src2", "", - [(set QPR:$dst, (v4i32 (or QPR:$src1, - (vnot_conv QPR:$src2))))]>; + (ins QPR:$src1, QPR:$src2), N3RegFrm, IIC_VBINiQ, + "vorn", "$dst, $src1, $src2", "", + [(set QPR:$dst, (v4i32 (or QPR:$src1, + (vnot16 QPR:$src2))))]>; // VMVN : Vector Bitwise NOT def VMVNd : N2VX<0b11, 0b11, 0b00, 0b00, 0b01011, 0, 0, - 
(outs DPR:$dst), (ins DPR:$src), IIC_VSHLiD, - "vmvn", "$dst, $src", "", - [(set DPR:$dst, (v2i32 (vnot DPR:$src)))]>; + (outs DPR:$dst), (ins DPR:$src), IIC_VSHLiD, + "vmvn", "$dst, $src", "", + [(set DPR:$dst, (v2i32 (vnot8 DPR:$src)))]>; def VMVNq : N2VX<0b11, 0b11, 0b00, 0b00, 0b01011, 1, 0, - (outs QPR:$dst), (ins QPR:$src), IIC_VSHLiD, - "vmvn", "$dst, $src", "", - [(set QPR:$dst, (v4i32 (vnot QPR:$src)))]>; -def : Pat<(v2i32 (vnot_conv DPR:$src)), (VMVNd DPR:$src)>; -def : Pat<(v4i32 (vnot_conv QPR:$src)), (VMVNq QPR:$src)>; + (outs QPR:$dst), (ins QPR:$src), IIC_VSHLiD, + "vmvn", "$dst, $src", "", + [(set QPR:$dst, (v4i32 (vnot16 QPR:$src)))]>; +def : Pat<(v2i32 (vnot8 DPR:$src)), (VMVNd DPR:$src)>; +def : Pat<(v4i32 (vnot16 QPR:$src)), (VMVNq QPR:$src)>; // VBSL : Vector Bitwise Select def VBSLd : N3VX<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$dst), - (ins DPR:$src1, DPR:$src2, DPR:$src3), IIC_VCNTiD, - "vbsl", "$dst, $src2, $src3", "$src1 = $dst", - [(set DPR:$dst, - (v2i32 (or (and DPR:$src2, DPR:$src1), - (and DPR:$src3, (vnot_conv DPR:$src1)))))]>; + (ins DPR:$src1, DPR:$src2, DPR:$src3), + N3RegFrm, IIC_VCNTiD, + "vbsl", "$dst, $src2, $src3", "$src1 = $dst", + [(set DPR:$dst, + (v2i32 (or (and DPR:$src2, DPR:$src1), + (and DPR:$src3, (vnot8 DPR:$src1)))))]>; def VBSLq : N3VX<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$dst), - (ins QPR:$src1, QPR:$src2, QPR:$src3), IIC_VCNTiQ, - "vbsl", "$dst, $src2, $src3", "$src1 = $dst", - [(set QPR:$dst, - (v4i32 (or (and QPR:$src2, QPR:$src1), - (and QPR:$src3, (vnot_conv QPR:$src1)))))]>; + (ins QPR:$src1, QPR:$src2, QPR:$src3), + N3RegFrm, IIC_VCNTiQ, + "vbsl", "$dst, $src2, $src3", "$src1 = $dst", + [(set QPR:$dst, + (v4i32 (or (and QPR:$src2, QPR:$src1), + (and QPR:$src3, (vnot16 QPR:$src1)))))]>; // VBIF : Vector Bitwise Insert if False // like VBSL but with: "vbif $dst, $src3, $src1", "$src2 = $dst", def VBIFd : N3VX<1, 0, 0b11, 0b0001, 0, 1, (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), - IIC_VBINiD, "vbif", 
"$dst, $src2, $src3", "$src1 = $dst", + N3RegFrm, IIC_VBINiD, + "vbif", "$dst, $src2, $src3", "$src1 = $dst", [/* For disassembly only; pattern left blank */]>; def VBIFq : N3VX<1, 0, 0b11, 0b0001, 1, 1, (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), - IIC_VBINiQ, "vbif", "$dst, $src2, $src3", "$src1 = $dst", + N3RegFrm, IIC_VBINiQ, + "vbif", "$dst, $src2, $src3", "$src1 = $dst", [/* For disassembly only; pattern left blank */]>; // VBIT : Vector Bitwise Insert if True // like VBSL but with: "vbit $dst, $src2, $src1", "$src3 = $dst", def VBITd : N3VX<1, 0, 0b10, 0b0001, 0, 1, (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), - IIC_VBINiD, "vbit", "$dst, $src2, $src3", "$src1 = $dst", + N3RegFrm, IIC_VBINiD, + "vbit", "$dst, $src2, $src3", "$src1 = $dst", [/* For disassembly only; pattern left blank */]>; def VBITq : N3VX<1, 0, 0b10, 0b0001, 1, 1, (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), - IIC_VBINiQ, "vbit", "$dst, $src2, $src3", "$src1 = $dst", + N3RegFrm, IIC_VBINiQ, + "vbit", "$dst, $src2, $src3", "$src1 = $dst", [/* For disassembly only; pattern left blank */]>; // VBIT/VBIF are not yet implemented. The TwoAddress pass will not go looking @@ -2463,15 +2446,15 @@ def VBITq : N3VX<1, 0, 0b10, 0b0001, 1, 1, // Vector Absolute Differences. 
// VABD : Vector Absolute Difference -defm VABDs : N3VInt_QHS<0, 0, 0b0111, 0, IIC_VBINi4D, IIC_VBINi4D, - IIC_VBINi4Q, IIC_VBINi4Q, +defm VABDs : N3VInt_QHS<0, 0, 0b0111, 0, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, "vabd", "s", int_arm_neon_vabds, 0>; -defm VABDu : N3VInt_QHS<1, 0, 0b0111, 0, IIC_VBINi4D, IIC_VBINi4D, - IIC_VBINi4Q, IIC_VBINi4Q, +defm VABDu : N3VInt_QHS<1, 0, 0b0111, 0, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, "vabd", "u", int_arm_neon_vabdu, 0>; -def VABDfd : N3VDInt<1, 0, 0b10, 0b1101, 0, IIC_VBIND, +def VABDfd : N3VDInt<1, 0, 0b10, 0b1101, 0, N3RegFrm, IIC_VBIND, "vabd", "f32", v2f32, v2f32, int_arm_neon_vabds, 0>; -def VABDfq : N3VQInt<1, 0, 0b10, 0b1101, 0, IIC_VBINQ, +def VABDfq : N3VQInt<1, 0, 0b10, 0b1101, 0, N3RegFrm, IIC_VBINQ, "vabd", "f32", v4f32, v4f32, int_arm_neon_vabds, 0>; // VABDL : Vector Absolute Difference Long (Q = | D - D |) @@ -2491,36 +2474,40 @@ defm VABALu : N3VLInt3_QHS<1,1,0b0101,0, "vabal", "u", int_arm_neon_vabalu>; // Vector Maximum and Minimum. 
// VMAX : Vector Maximum -defm VMAXs : N3VInt_QHS<0,0,0b0110,0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, - IIC_VBINi4Q, "vmax", "s", int_arm_neon_vmaxs, 1>; -defm VMAXu : N3VInt_QHS<1,0,0b0110,0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, - IIC_VBINi4Q, "vmax", "u", int_arm_neon_vmaxu, 1>; -def VMAXfd : N3VDInt<0, 0, 0b00, 0b1111, 0, IIC_VBIND, "vmax", "f32", - v2f32, v2f32, int_arm_neon_vmaxs, 1>; -def VMAXfq : N3VQInt<0, 0, 0b00, 0b1111, 0, IIC_VBINQ, "vmax", "f32", - v4f32, v4f32, int_arm_neon_vmaxs, 1>; +defm VMAXs : N3VInt_QHS<0, 0, 0b0110, 0, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vmax", "s", int_arm_neon_vmaxs, 1>; +defm VMAXu : N3VInt_QHS<1, 0, 0b0110, 0, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vmax", "u", int_arm_neon_vmaxu, 1>; +def VMAXfd : N3VDInt<0, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VBIND, "vmax", + "f32", v2f32, v2f32, int_arm_neon_vmaxs, 1>; +def VMAXfq : N3VQInt<0, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VBINQ, "vmax", + "f32", v4f32, v4f32, int_arm_neon_vmaxs, 1>; // VMIN : Vector Minimum -defm VMINs : N3VInt_QHS<0,0,0b0110,1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, - IIC_VBINi4Q, "vmin", "s", int_arm_neon_vmins, 1>; -defm VMINu : N3VInt_QHS<1,0,0b0110,1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, - IIC_VBINi4Q, "vmin", "u", int_arm_neon_vminu, 1>; -def VMINfd : N3VDInt<0, 0, 0b10, 0b1111, 0, IIC_VBIND, "vmin", "f32", - v2f32, v2f32, int_arm_neon_vmins, 1>; -def VMINfq : N3VQInt<0, 0, 0b10, 0b1111, 0, IIC_VBINQ, "vmin", "f32", - v4f32, v4f32, int_arm_neon_vmins, 1>; +defm VMINs : N3VInt_QHS<0, 0, 0b0110, 1, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vmin", "s", int_arm_neon_vmins, 1>; +defm VMINu : N3VInt_QHS<1, 0, 0b0110, 1, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vmin", "u", int_arm_neon_vminu, 1>; +def VMINfd : N3VDInt<0, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VBIND, "vmin", + "f32", v2f32, v2f32, int_arm_neon_vmins, 1>; +def VMINfq : N3VQInt<0, 0, 0b10, 0b1111, 
0, N3RegFrm, IIC_VBINQ, "vmin", + "f32", v4f32, v4f32, int_arm_neon_vmins, 1>; // Vector Pairwise Operations. // VPADD : Vector Pairwise Add -def VPADDi8 : N3VDInt<0, 0, 0b00, 0b1011, 1, IIC_VBINiD, "vpadd", "i8", - v8i8, v8i8, int_arm_neon_vpadd, 0>; -def VPADDi16 : N3VDInt<0, 0, 0b01, 0b1011, 1, IIC_VBINiD, "vpadd", "i16", - v4i16, v4i16, int_arm_neon_vpadd, 0>; -def VPADDi32 : N3VDInt<0, 0, 0b10, 0b1011, 1, IIC_VBINiD, "vpadd", "i32", - v2i32, v2i32, int_arm_neon_vpadd, 0>; -def VPADDf : N3VDInt<1, 0, 0b00, 0b1101, 0, IIC_VBIND, "vpadd", "f32", - v2f32, v2f32, int_arm_neon_vpadd, 0>; +def VPADDi8 : N3VDInt<0, 0, 0b00, 0b1011, 1, N3RegFrm, IIC_VBINiD, "vpadd", + "i8", v8i8, v8i8, int_arm_neon_vpadd, 0>; +def VPADDi16 : N3VDInt<0, 0, 0b01, 0b1011, 1, N3RegFrm, IIC_VBINiD, "vpadd", + "i16", v4i16, v4i16, int_arm_neon_vpadd, 0>; +def VPADDi32 : N3VDInt<0, 0, 0b10, 0b1011, 1, N3RegFrm, IIC_VBINiD, "vpadd", + "i32", v2i32, v2i32, int_arm_neon_vpadd, 0>; +def VPADDf : N3VDInt<1, 0, 0b00, 0b1101, 0, N3RegFrm, IIC_VBIND, "vpadd", + "f32", v2f32, v2f32, int_arm_neon_vpadd, 0>; // VPADDL : Vector Pairwise Add Long defm VPADDLs : N2VPLInt_QHS<0b11, 0b11, 0b00, 0b00100, 0, "vpaddl", "s", @@ -2535,36 +2522,36 @@ defm VPADALu : N2VPLInt2_QHS<0b11, 0b11, 0b00, 0b01101, 0, "vpadal", "u", int_arm_neon_vpadalu>; // VPMAX : Vector Pairwise Maximum -def VPMAXs8 : N3VDInt<0, 0, 0b00, 0b1010, 0, IIC_VBINi4D, "vpmax", "s8", - v8i8, v8i8, int_arm_neon_vpmaxs, 0>; -def VPMAXs16 : N3VDInt<0, 0, 0b01, 0b1010, 0, IIC_VBINi4D, "vpmax", "s16", - v4i16, v4i16, int_arm_neon_vpmaxs, 0>; -def VPMAXs32 : N3VDInt<0, 0, 0b10, 0b1010, 0, IIC_VBINi4D, "vpmax", "s32", - v2i32, v2i32, int_arm_neon_vpmaxs, 0>; -def VPMAXu8 : N3VDInt<1, 0, 0b00, 0b1010, 0, IIC_VBINi4D, "vpmax", "u8", - v8i8, v8i8, int_arm_neon_vpmaxu, 0>; -def VPMAXu16 : N3VDInt<1, 0, 0b01, 0b1010, 0, IIC_VBINi4D, "vpmax", "u16", - v4i16, v4i16, int_arm_neon_vpmaxu, 0>; -def VPMAXu32 : N3VDInt<1, 0, 0b10, 0b1010, 0, IIC_VBINi4D, "vpmax", 
"u32", - v2i32, v2i32, int_arm_neon_vpmaxu, 0>; -def VPMAXf : N3VDInt<1, 0, 0b00, 0b1111, 0, IIC_VBINi4D, "vpmax", "f32", - v2f32, v2f32, int_arm_neon_vpmaxs, 0>; +def VPMAXs8 : N3VDInt<0, 0, 0b00, 0b1010, 0, N3RegFrm, IIC_VBINi4D, "vpmax", + "s8", v8i8, v8i8, int_arm_neon_vpmaxs, 0>; +def VPMAXs16 : N3VDInt<0, 0, 0b01, 0b1010, 0, N3RegFrm, IIC_VBINi4D, "vpmax", + "s16", v4i16, v4i16, int_arm_neon_vpmaxs, 0>; +def VPMAXs32 : N3VDInt<0, 0, 0b10, 0b1010, 0, N3RegFrm, IIC_VBINi4D, "vpmax", + "s32", v2i32, v2i32, int_arm_neon_vpmaxs, 0>; +def VPMAXu8 : N3VDInt<1, 0, 0b00, 0b1010, 0, N3RegFrm, IIC_VBINi4D, "vpmax", + "u8", v8i8, v8i8, int_arm_neon_vpmaxu, 0>; +def VPMAXu16 : N3VDInt<1, 0, 0b01, 0b1010, 0, N3RegFrm, IIC_VBINi4D, "vpmax", + "u16", v4i16, v4i16, int_arm_neon_vpmaxu, 0>; +def VPMAXu32 : N3VDInt<1, 0, 0b10, 0b1010, 0, N3RegFrm, IIC_VBINi4D, "vpmax", + "u32", v2i32, v2i32, int_arm_neon_vpmaxu, 0>; +def VPMAXf : N3VDInt<1, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VBINi4D, "vpmax", + "f32", v2f32, v2f32, int_arm_neon_vpmaxs, 0>; // VPMIN : Vector Pairwise Minimum -def VPMINs8 : N3VDInt<0, 0, 0b00, 0b1010, 1, IIC_VBINi4D, "vpmin", "s8", - v8i8, v8i8, int_arm_neon_vpmins, 0>; -def VPMINs16 : N3VDInt<0, 0, 0b01, 0b1010, 1, IIC_VBINi4D, "vpmin", "s16", - v4i16, v4i16, int_arm_neon_vpmins, 0>; -def VPMINs32 : N3VDInt<0, 0, 0b10, 0b1010, 1, IIC_VBINi4D, "vpmin", "s32", - v2i32, v2i32, int_arm_neon_vpmins, 0>; -def VPMINu8 : N3VDInt<1, 0, 0b00, 0b1010, 1, IIC_VBINi4D, "vpmin", "u8", - v8i8, v8i8, int_arm_neon_vpminu, 0>; -def VPMINu16 : N3VDInt<1, 0, 0b01, 0b1010, 1, IIC_VBINi4D, "vpmin", "u16", - v4i16, v4i16, int_arm_neon_vpminu, 0>; -def VPMINu32 : N3VDInt<1, 0, 0b10, 0b1010, 1, IIC_VBINi4D, "vpmin", "u32", - v2i32, v2i32, int_arm_neon_vpminu, 0>; -def VPMINf : N3VDInt<1, 0, 0b10, 0b1111, 0, IIC_VBINi4D, "vpmin", "f32", - v2f32, v2f32, int_arm_neon_vpmins, 0>; +def VPMINs8 : N3VDInt<0, 0, 0b00, 0b1010, 1, N3RegFrm, IIC_VBINi4D, "vpmin", + "s8", v8i8, v8i8, 
int_arm_neon_vpmins, 0>; +def VPMINs16 : N3VDInt<0, 0, 0b01, 0b1010, 1, N3RegFrm, IIC_VBINi4D, "vpmin", + "s16", v4i16, v4i16, int_arm_neon_vpmins, 0>; +def VPMINs32 : N3VDInt<0, 0, 0b10, 0b1010, 1, N3RegFrm, IIC_VBINi4D, "vpmin", + "s32", v2i32, v2i32, int_arm_neon_vpmins, 0>; +def VPMINu8 : N3VDInt<1, 0, 0b00, 0b1010, 1, N3RegFrm, IIC_VBINi4D, "vpmin", + "u8", v8i8, v8i8, int_arm_neon_vpminu, 0>; +def VPMINu16 : N3VDInt<1, 0, 0b01, 0b1010, 1, N3RegFrm, IIC_VBINi4D, "vpmin", + "u16", v4i16, v4i16, int_arm_neon_vpminu, 0>; +def VPMINu32 : N3VDInt<1, 0, 0b10, 0b1010, 1, N3RegFrm, IIC_VBINi4D, "vpmin", + "u32", v2i32, v2i32, int_arm_neon_vpminu, 0>; +def VPMINf : N3VDInt<1, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VBINi4D, "vpmin", + "f32", v2f32, v2f32, int_arm_neon_vpmins, 0>; // Vector Reciprocal and Reciprocal Square Root Estimate and Step. @@ -2583,10 +2570,10 @@ def VRECPEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0, v4f32, v4f32, int_arm_neon_vrecpe>; // VRECPS : Vector Reciprocal Step -def VRECPSfd : N3VDInt<0, 0, 0b00, 0b1111, 1, +def VRECPSfd : N3VDInt<0, 0, 0b00, 0b1111, 1, N3RegFrm, IIC_VRECSD, "vrecps", "f32", v2f32, v2f32, int_arm_neon_vrecps, 1>; -def VRECPSfq : N3VQInt<0, 0, 0b00, 0b1111, 1, +def VRECPSfq : N3VQInt<0, 0, 0b00, 0b1111, 1, N3RegFrm, IIC_VRECSQ, "vrecps", "f32", v4f32, v4f32, int_arm_neon_vrecps, 1>; @@ -2605,25 +2592,30 @@ def VRSQRTEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0, v4f32, v4f32, int_arm_neon_vrsqrte>; // VRSQRTS : Vector Reciprocal Square Root Step -def VRSQRTSfd : N3VDInt<0, 0, 0b10, 0b1111, 1, +def VRSQRTSfd : N3VDInt<0, 0, 0b10, 0b1111, 1, N3RegFrm, IIC_VRECSD, "vrsqrts", "f32", v2f32, v2f32, int_arm_neon_vrsqrts, 1>; -def VRSQRTSfq : N3VQInt<0, 0, 0b10, 0b1111, 1, +def VRSQRTSfq : N3VQInt<0, 0, 0b10, 0b1111, 1, N3RegFrm, IIC_VRECSQ, "vrsqrts", "f32", v4f32, v4f32, int_arm_neon_vrsqrts, 1>; // Vector Shifts. 
// VSHL : Vector Shift -defm VSHLs : N3VInt_QHSD<0, 0, 0b0100, 0, IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ, - IIC_VSHLiQ, "vshl", "s", int_arm_neon_vshifts, 0>; -defm VSHLu : N3VInt_QHSD<1, 0, 0b0100, 0, IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ, - IIC_VSHLiQ, "vshl", "u", int_arm_neon_vshiftu, 0>; +defm VSHLs : N3VInt_QHSD<0, 0, 0b0100, 0, N3RegVShFrm, + IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ, IIC_VSHLiQ, + "vshl", "s", int_arm_neon_vshifts, 0>; +defm VSHLu : N3VInt_QHSD<1, 0, 0b0100, 0, N3RegVShFrm, + IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ, IIC_VSHLiQ, + "vshl", "u", int_arm_neon_vshiftu, 0>; // VSHL : Vector Shift Left (Immediate) -defm VSHLi : N2VSh_QHSD<0, 1, 0b0101, 1, IIC_VSHLiD, "vshl", "i", NEONvshl>; +defm VSHLi : N2VSh_QHSD<0, 1, 0b0101, 1, IIC_VSHLiD, "vshl", "i", NEONvshl, + N2RegVShLFrm>; // VSHR : Vector Shift Right (Immediate) -defm VSHRs : N2VSh_QHSD<0, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "s", NEONvshrs>; -defm VSHRu : N2VSh_QHSD<1, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "u", NEONvshru>; +defm VSHRs : N2VSh_QHSD<0, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "s", NEONvshrs, + N2RegVShRFrm>; +defm VSHRu : N2VSh_QHSD<1, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "u", NEONvshru, + N2RegVShRFrm>; // VSHLL : Vector Shift Left Long defm VSHLLs : N2VLSh_QHS<0, 1, 0b1010, 0, 0, 1, "vshll", "s", NEONvshlls>; @@ -2649,28 +2641,37 @@ defm VSHRN : N2VNSh_HSD<0,1,0b1000,0,0,1, IIC_VSHLiD, "vshrn", "i", NEONvshrn>; // VRSHL : Vector Rounding Shift -defm VRSHLs : N3VInt_QHSD<0,0,0b0101,0, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, - IIC_VSHLi4Q, "vrshl", "s", int_arm_neon_vrshifts,0>; -defm VRSHLu : N3VInt_QHSD<1,0,0b0101,0, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, - IIC_VSHLi4Q, "vrshl", "u", int_arm_neon_vrshiftu,0>; +defm VRSHLs : N3VInt_QHSD<0, 0, 0b0101, 0, N3RegVShFrm, + IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, + "vrshl", "s", int_arm_neon_vrshifts, 0>; +defm VRSHLu : N3VInt_QHSD<1, 0, 0b0101, 0, N3RegVShFrm, + IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, + "vrshl", "u", 
int_arm_neon_vrshiftu, 0>; // VRSHR : Vector Rounding Shift Right -defm VRSHRs : N2VSh_QHSD<0,1,0b0010,1, IIC_VSHLi4D, "vrshr", "s", NEONvrshrs>; -defm VRSHRu : N2VSh_QHSD<1,1,0b0010,1, IIC_VSHLi4D, "vrshr", "u", NEONvrshru>; +defm VRSHRs : N2VSh_QHSD<0,1,0b0010,1, IIC_VSHLi4D, "vrshr", "s", NEONvrshrs, + N2RegVShRFrm>; +defm VRSHRu : N2VSh_QHSD<1,1,0b0010,1, IIC_VSHLi4D, "vrshr", "u", NEONvrshru, + N2RegVShRFrm>; // VRSHRN : Vector Rounding Shift Right and Narrow defm VRSHRN : N2VNSh_HSD<0, 1, 0b1000, 0, 1, 1, IIC_VSHLi4D, "vrshrn", "i", NEONvrshrn>; // VQSHL : Vector Saturating Shift -defm VQSHLs : N3VInt_QHSD<0,0,0b0100,1, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, - IIC_VSHLi4Q, "vqshl", "s", int_arm_neon_vqshifts,0>; -defm VQSHLu : N3VInt_QHSD<1,0,0b0100,1, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, - IIC_VSHLi4Q, "vqshl", "u", int_arm_neon_vqshiftu,0>; +defm VQSHLs : N3VInt_QHSD<0, 0, 0b0100, 1, N3RegVShFrm, + IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, + "vqshl", "s", int_arm_neon_vqshifts, 0>; +defm VQSHLu : N3VInt_QHSD<1, 0, 0b0100, 1, N3RegVShFrm, + IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, + "vqshl", "u", int_arm_neon_vqshiftu, 0>; // VQSHL : Vector Saturating Shift Left (Immediate) -defm VQSHLsi : N2VSh_QHSD<0,1,0b0111,1, IIC_VSHLi4D, "vqshl", "s", NEONvqshls>; -defm VQSHLui : N2VSh_QHSD<1,1,0b0111,1, IIC_VSHLi4D, "vqshl", "u", NEONvqshlu>; +defm VQSHLsi : N2VSh_QHSD<0,1,0b0111,1, IIC_VSHLi4D, "vqshl", "s",NEONvqshls, + N2RegVShLFrm>; +defm VQSHLui : N2VSh_QHSD<1,1,0b0111,1, IIC_VSHLi4D, "vqshl", "u",NEONvqshlu, + N2RegVShLFrm>; // VQSHLU : Vector Saturating Shift Left (Immediate, Unsigned) -defm VQSHLsu : N2VSh_QHSD<1,1,0b0110,1, IIC_VSHLi4D, "vqshlu","s",NEONvqshlsu>; +defm VQSHLsu : N2VSh_QHSD<1,1,0b0110,1, IIC_VSHLi4D,"vqshlu","s",NEONvqshlsu, + N2RegVShLFrm>; // VQSHRN : Vector Saturating Shift Right and Narrow defm VQSHRNs : N2VNSh_HSD<0, 1, 0b1001, 0, 0, 1, IIC_VSHLi4D, "vqshrn", "s", @@ -2683,12 +2684,12 @@ defm VQSHRUN : N2VNSh_HSD<1, 
1, 0b1000, 0, 0, 1, IIC_VSHLi4D, "vqshrun", "s", NEONvqshrnsu>; // VQRSHL : Vector Saturating Rounding Shift -defm VQRSHLs : N3VInt_QHSD<0,0,0b0101,1, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, - IIC_VSHLi4Q, "vqrshl", "s", - int_arm_neon_vqrshifts, 0>; -defm VQRSHLu : N3VInt_QHSD<1,0,0b0101,1, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, - IIC_VSHLi4Q, "vqrshl", "u", - int_arm_neon_vqrshiftu, 0>; +defm VQRSHLs : N3VInt_QHSD<0, 0, 0b0101, 1, N3RegVShFrm, + IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, + "vqrshl", "s", int_arm_neon_vqrshifts, 0>; +defm VQRSHLu : N3VInt_QHSD<1, 0, 0b0101, 1, N3RegVShFrm, + IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, + "vqrshl", "u", int_arm_neon_vqrshiftu, 0>; // VQRSHRN : Vector Saturating Rounding Shift Right and Narrow defm VQRSHRNs : N2VNSh_HSD<0, 1, 0b1001, 0, 1, 1, IIC_VSHLi4D, "vqrshrn", "s", @@ -2708,9 +2709,9 @@ defm VRSRAs : N2VShAdd_QHSD<0, 1, 0b0011, 1, "vrsra", "s", NEONvrshrs>; defm VRSRAu : N2VShAdd_QHSD<1, 1, 0b0011, 1, "vrsra", "u", NEONvrshru>; // VSLI : Vector Shift Left and Insert -defm VSLI : N2VShIns_QHSD<1, 1, 0b0101, 1, "vsli", NEONvsli>; +defm VSLI : N2VShIns_QHSD<1, 1, 0b0101, 1, "vsli", NEONvsli, N2RegVShLFrm>; // VSRI : Vector Shift Right and Insert -defm VSRI : N2VShIns_QHSD<1, 1, 0b0100, 1, "vsri", NEONvsri>; +defm VSRI : N2VShIns_QHSD<1, 1, 0b0100, 1, "vsri", NEONvsri, N2RegVShRFrm>; // Vector Absolute and Saturating Absolute. @@ -2732,19 +2733,22 @@ defm VQABS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01110, 0, // Vector Negate. 
-def vneg : PatFrag<(ops node:$in), (sub immAllZerosV, node:$in)>; -def vneg_conv : PatFrag<(ops node:$in), (sub immAllZerosV_bc, node:$in)>; +def vneg : PatFrag<(ops node:$in), (sub immAllZerosV, node:$in)>; +def vneg8 : PatFrag<(ops node:$in), + (sub (bitconvert (v8i8 immAllZerosV)), node:$in)>; +def vneg16 : PatFrag<(ops node:$in), + (sub (bitconvert (v16i8 immAllZerosV)), node:$in)>; class VNEGD size, string OpcodeStr, string Dt, ValueType Ty> : N2V<0b11, 0b11, size, 0b01, 0b00111, 0, 0, (outs DPR:$dst), (ins DPR:$src), IIC_VSHLiD, OpcodeStr, Dt, "$dst, $src", "", - [(set DPR:$dst, (Ty (vneg DPR:$src)))]>; + [(set DPR:$dst, (Ty (vneg8 DPR:$src)))]>; class VNEGQ size, string OpcodeStr, string Dt, ValueType Ty> : N2V<0b11, 0b11, size, 0b01, 0b00111, 1, 0, (outs QPR:$dst), (ins QPR:$src), IIC_VSHLiD, OpcodeStr, Dt, "$dst, $src", "", - [(set QPR:$dst, (Ty (vneg QPR:$src)))]>; + [(set QPR:$dst, (Ty (vneg16 QPR:$src)))]>; -// VNEG : Vector Negate +// VNEG : Vector Negate (integer) def VNEGs8d : VNEGD<0b00, "vneg", "s8", v8i8>; def VNEGs16d : VNEGD<0b01, "vneg", "s16", v4i16>; def VNEGs32d : VNEGD<0b10, "vneg", "s32", v2i32>; @@ -2762,12 +2766,12 @@ def VNEGf32q : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 1, 0, "vneg", "f32", "$dst, $src", "", [(set QPR:$dst, (v4f32 (fneg QPR:$src)))]>; -def : Pat<(v8i8 (vneg_conv DPR:$src)), (VNEGs8d DPR:$src)>; -def : Pat<(v4i16 (vneg_conv DPR:$src)), (VNEGs16d DPR:$src)>; -def : Pat<(v2i32 (vneg_conv DPR:$src)), (VNEGs32d DPR:$src)>; -def : Pat<(v16i8 (vneg_conv QPR:$src)), (VNEGs8q QPR:$src)>; -def : Pat<(v8i16 (vneg_conv QPR:$src)), (VNEGs16q QPR:$src)>; -def : Pat<(v4i32 (vneg_conv QPR:$src)), (VNEGs32q QPR:$src)>; +def : Pat<(v8i8 (vneg8 DPR:$src)), (VNEGs8d DPR:$src)>; +def : Pat<(v4i16 (vneg8 DPR:$src)), (VNEGs16d DPR:$src)>; +def : Pat<(v2i32 (vneg8 DPR:$src)), (VNEGs32d DPR:$src)>; +def : Pat<(v16i8 (vneg16 QPR:$src)), (VNEGs8q QPR:$src)>; +def : Pat<(v8i16 (vneg16 QPR:$src)), (VNEGs16q QPR:$src)>; +def : Pat<(v4i32 (vneg16 
QPR:$src)), (VNEGs32q QPR:$src)>; // VQNEG : Vector Saturating Negate defm VQNEG : N2VInt_QHS<0b11, 0b11, 0b00, 0b01111, 0, @@ -2805,9 +2809,9 @@ def VSWPq : N2VX<0b11, 0b11, 0b00, 0b10, 0b00000, 1, 0, // VMOV : Vector Move (Register) def VMOVDneon: N3VX<0, 0, 0b10, 0b0001, 0, 1, (outs DPR:$dst), (ins DPR:$src), - IIC_VMOVD, "vmov", "$dst, $src", "", []>; + N3RegFrm, IIC_VMOVD, "vmov", "$dst, $src", "", []>; def VMOVQ : N3VX<0, 0, 0b10, 0b0001, 1, 1, (outs QPR:$dst), (ins QPR:$src), - IIC_VMOVD, "vmov", "$dst, $src", "", []>; + N3RegFrm, IIC_VMOVD, "vmov", "$dst, $src", "", []>; // VMOV : Vector Move (Immediate) @@ -3048,30 +3052,29 @@ def VDUPfq : NVDup<0b11101010, 0b1011, 0b00, (outs QPR:$dst), (ins GPR:$src), // VDUP : Vector Duplicate Lane (from scalar to all elements) -class VDUPLND op19_18, bits<2> op17_16, - string OpcodeStr, string Dt, ValueType Ty> - : N2V<0b11, 0b11, op19_18, op17_16, 0b11000, 0, 0, - (outs DPR:$dst), (ins DPR:$src, nohash_imm:$lane), IIC_VMOVD, - OpcodeStr, Dt, "$dst, $src[$lane]", "", - [(set DPR:$dst, (Ty (NEONvduplane (Ty DPR:$src), imm:$lane)))]>; +class VDUPLND op19_16, string OpcodeStr, string Dt, + ValueType Ty> + : NVDupLane; -class VDUPLNQ op19_18, bits<2> op17_16, string OpcodeStr, string Dt, +class VDUPLNQ op19_16, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy> - : N2V<0b11, 0b11, op19_18, op17_16, 0b11000, 1, 0, - (outs QPR:$dst), (ins DPR:$src, nohash_imm:$lane), IIC_VMOVD, - OpcodeStr, Dt, "$dst, $src[$lane]", "", - [(set QPR:$dst, (ResTy (NEONvduplane (OpTy DPR:$src), imm:$lane)))]>; + : NVDupLane; // Inst{19-16} is partially specified depending on the element size. 
-def VDUPLN8d : VDUPLND<{?,?}, {?,1}, "vdup", "8", v8i8>; -def VDUPLN16d : VDUPLND<{?,?}, {1,0}, "vdup", "16", v4i16>; -def VDUPLN32d : VDUPLND<{?,1}, {0,0}, "vdup", "32", v2i32>; -def VDUPLNfd : VDUPLND<{?,1}, {0,0}, "vdup", "32", v2f32>; -def VDUPLN8q : VDUPLNQ<{?,?}, {?,1}, "vdup", "8", v16i8, v8i8>; -def VDUPLN16q : VDUPLNQ<{?,?}, {1,0}, "vdup", "16", v8i16, v4i16>; -def VDUPLN32q : VDUPLNQ<{?,1}, {0,0}, "vdup", "32", v4i32, v2i32>; -def VDUPLNfq : VDUPLNQ<{?,1}, {0,0}, "vdup", "32", v4f32, v2f32>; +def VDUPLN8d : VDUPLND<{?,?,?,1}, "vdup", "8", v8i8>; +def VDUPLN16d : VDUPLND<{?,?,1,0}, "vdup", "16", v4i16>; +def VDUPLN32d : VDUPLND<{?,1,0,0}, "vdup", "32", v2i32>; +def VDUPLNfd : VDUPLND<{?,1,0,0}, "vdup", "32", v2f32>; +def VDUPLN8q : VDUPLNQ<{?,?,?,1}, "vdup", "8", v16i8, v8i8>; +def VDUPLN16q : VDUPLNQ<{?,?,1,0}, "vdup", "16", v8i16, v4i16>; +def VDUPLN32q : VDUPLNQ<{?,1,0,0}, "vdup", "32", v4i32, v2i32>; +def VDUPLNfq : VDUPLNQ<{?,1,0,0}, "vdup", "32", v4f32, v2f32>; def : Pat<(v16i8 (NEONvduplane (v16i8 QPR:$src), imm:$lane)), (v16i8 (VDUPLN8q (v8i8 (EXTRACT_SUBREG QPR:$src, @@ -3233,15 +3236,15 @@ def VREV16q8 : VREV16Q<0b00, "vrev16", "8", v16i8>; class VEXTd : N3V<0,1,0b11,{?,?,?,?},0,0, (outs DPR:$dst), - (ins DPR:$lhs, DPR:$rhs, i32imm:$index), IIC_VEXTD, - OpcodeStr, Dt, "$dst, $lhs, $rhs, $index", "", + (ins DPR:$lhs, DPR:$rhs, i32imm:$index), NVExtFrm, + IIC_VEXTD, OpcodeStr, Dt, "$dst, $lhs, $rhs, $index", "", [(set DPR:$dst, (Ty (NEONvext (Ty DPR:$lhs), (Ty DPR:$rhs), imm:$index)))]>; class VEXTq : N3V<0,1,0b11,{?,?,?,?},1,0, (outs QPR:$dst), - (ins QPR:$lhs, QPR:$rhs, i32imm:$index), IIC_VEXTQ, - OpcodeStr, Dt, "$dst, $lhs, $rhs, $index", "", + (ins QPR:$lhs, QPR:$rhs, i32imm:$index), NVExtFrm, + IIC_VEXTQ, OpcodeStr, Dt, "$dst, $lhs, $rhs, $index", "", [(set QPR:$dst, (Ty (NEONvext (Ty QPR:$lhs), (Ty QPR:$rhs), imm:$index)))]>; @@ -3290,25 +3293,26 @@ def VZIPq32 : N2VQShuffle<0b10, 0b00011, IIC_VPERMQ3, "vzip", "32">; // VTBL : Vector Table 
Lookup def VTBL1 : N3V<1,1,0b11,0b1000,0,0, (outs DPR:$dst), - (ins DPR:$tbl1, DPR:$src), IIC_VTB1, + (ins DPR:$tbl1, DPR:$src), NVTBLFrm, IIC_VTB1, "vtbl", "8", "$dst, \\{$tbl1\\}, $src", "", [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl1 DPR:$tbl1, DPR:$src)))]>; let hasExtraSrcRegAllocReq = 1 in { def VTBL2 : N3V<1,1,0b11,0b1001,0,0, (outs DPR:$dst), - (ins DPR:$tbl1, DPR:$tbl2, DPR:$src), IIC_VTB2, + (ins DPR:$tbl1, DPR:$tbl2, DPR:$src), NVTBLFrm, IIC_VTB2, "vtbl", "8", "$dst, \\{$tbl1, $tbl2\\}, $src", "", [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl2 DPR:$tbl1, DPR:$tbl2, DPR:$src)))]>; def VTBL3 : N3V<1,1,0b11,0b1010,0,0, (outs DPR:$dst), - (ins DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src), IIC_VTB3, + (ins DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src), NVTBLFrm, IIC_VTB3, "vtbl", "8", "$dst, \\{$tbl1, $tbl2, $tbl3\\}, $src", "", [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl3 DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src)))]>; def VTBL4 : N3V<1,1,0b11,0b1011,0,0, (outs DPR:$dst), - (ins DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src), IIC_VTB4, + (ins DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src), + NVTBLFrm, IIC_VTB4, "vtbl", "8", "$dst, \\{$tbl1, $tbl2, $tbl3, $tbl4\\}, $src", "", [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl4 DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src)))]>; @@ -3317,26 +3321,27 @@ def VTBL4 // VTBX : Vector Table Extension def VTBX1 : N3V<1,1,0b11,0b1000,1,0, (outs DPR:$dst), - (ins DPR:$orig, DPR:$tbl1, DPR:$src), IIC_VTBX1, + (ins DPR:$orig, DPR:$tbl1, DPR:$src), NVTBLFrm, IIC_VTBX1, "vtbx", "8", "$dst, \\{$tbl1\\}, $src", "$orig = $dst", [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx1 DPR:$orig, DPR:$tbl1, DPR:$src)))]>; let hasExtraSrcRegAllocReq = 1 in { def VTBX2 : N3V<1,1,0b11,0b1001,1,0, (outs DPR:$dst), - (ins DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$src), IIC_VTBX2, + (ins DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$src), NVTBLFrm, IIC_VTBX2, "vtbx", "8", "$dst, \\{$tbl1, $tbl2\\}, $src", "$orig = $dst", [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx2 
DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$src)))]>; def VTBX3 : N3V<1,1,0b11,0b1010,1,0, (outs DPR:$dst), - (ins DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src), IIC_VTBX3, + (ins DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src), + NVTBLFrm, IIC_VTBX3, "vtbx", "8", "$dst, \\{$tbl1, $tbl2, $tbl3\\}, $src", "$orig = $dst", [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx3 DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src)))]>; def VTBX4 : N3V<1,1,0b11,0b1011,1,0, (outs DPR:$dst), (ins DPR:$orig, DPR:$tbl1, - DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src), IIC_VTBX4, + DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src), NVTBLFrm, IIC_VTBX4, "vtbx", "8", "$dst, \\{$tbl1, $tbl2, $tbl3, $tbl4\\}, $src", "$orig = $dst", [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx4 DPR:$orig, DPR:$tbl1, @@ -3396,12 +3401,12 @@ def : N3VSPat; //let neverHasSideEffects = 1 in //def VMLAfd_sfp : N3VSMulOp<0,0,0b00,0b1101,1, IIC_VMACD, "vmla", "f32", -// v2f32, fmul, fadd>; +// v2f32, fmul, fadd>; //def : N3VSMulOpPat; //let neverHasSideEffects = 1 in //def VMLSfd_sfp : N3VSMulOp<0,0,0b10,0b1101,1, IIC_VMACD, "vmls", "f32", -// v2f32, fmul, fsub>; +// v2f32, fmul, fsub>; //def : N3VSMulOpPat; // Vector Absolute used for single-precision FP @@ -3421,14 +3426,14 @@ def : N2VSPat; // Vector Maximum used for single-precision FP let neverHasSideEffects = 1 in def VMAXfd_sfp : N3V<0, 0, 0b00, 0b1111, 0, 0, (outs DPR_VFP2:$dst), - (ins DPR_VFP2:$src1, DPR_VFP2:$src2), IIC_VBIND, + (ins DPR_VFP2:$src1, DPR_VFP2:$src2), N3RegFrm, IIC_VBIND, "vmax", "f32", "$dst, $src1, $src2", "", []>; def : N3VSPat; // Vector Minimum used for single-precision FP let neverHasSideEffects = 1 in def VMINfd_sfp : N3V<0, 0, 0b00, 0b1111, 0, 0, (outs DPR_VFP2:$dst), - (ins DPR_VFP2:$src1, DPR_VFP2:$src2), IIC_VBIND, + (ins DPR_VFP2:$src1, DPR_VFP2:$src2), N3RegFrm, IIC_VBIND, "vmin", "f32", "$dst, $src1, $src2", "", []>; def : N3VSPat; diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td index 
aca823022126..045838928660 100644 --- a/lib/Target/ARM/ARMInstrVFP.td +++ b/lib/Target/ARM/ARMInstrVFP.td @@ -545,7 +545,7 @@ def VULTOD : AVConv1XI<0b11101, 0b11, 0b1011, 0b1011, 1, // FP FMA Operations. // -def VMLAD : ADbI<0b11100, 0b00, 0, 0, +def VMLAD : ADbI_vmlX<0b11100, 0b00, 0, 0, (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b), IIC_fpMAC64, "vmla", ".f64\t$dst, $a, $b", [(set DPR:$dst, (fadd (fmul DPR:$a, DPR:$b), @@ -558,7 +558,7 @@ def VMLAS : ASbIn<0b11100, 0b00, 0, 0, [(set SPR:$dst, (fadd (fmul SPR:$a, SPR:$b), SPR:$dstin))]>, RegConstraint<"$dstin = $dst">; -def VNMLSD : ADbI<0b11100, 0b01, 0, 0, +def VNMLSD : ADbI_vmlX<0b11100, 0b01, 0, 0, (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b), IIC_fpMAC64, "vnmls", ".f64\t$dst, $a, $b", [(set DPR:$dst, (fsub (fmul DPR:$a, DPR:$b), @@ -571,7 +571,7 @@ def VNMLSS : ASbI<0b11100, 0b01, 0, 0, [(set SPR:$dst, (fsub (fmul SPR:$a, SPR:$b), SPR:$dstin))]>, RegConstraint<"$dstin = $dst">; -def VMLSD : ADbI<0b11100, 0b00, 1, 0, +def VMLSD : ADbI_vmlX<0b11100, 0b00, 1, 0, (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b), IIC_fpMAC64, "vmls", ".f64\t$dst, $a, $b", [(set DPR:$dst, (fadd (fneg (fmul DPR:$a, DPR:$b)), @@ -589,7 +589,7 @@ def : Pat<(fsub DPR:$dstin, (fmul DPR:$a, (f64 DPR:$b))), def : Pat<(fsub SPR:$dstin, (fmul SPR:$a, SPR:$b)), (VMLSS SPR:$dstin, SPR:$a, SPR:$b)>, Requires<[DontUseNEONForFP]>; -def VNMLAD : ADbI<0b11100, 0b01, 1, 0, +def VNMLAD : ADbI_vmlX<0b11100, 0b01, 1, 0, (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b), IIC_fpMAC64, "vnmla", ".f64\t$dst, $a, $b", [(set DPR:$dst, (fsub (fneg (fmul DPR:$a, DPR:$b)), diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index bdbec30d8596..cb762a4669fd 100644 --- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -341,6 +341,7 @@ ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, unsigned PReg = PMO.getReg(); unsigned PRegNum = 
PMO.isUndef() ? UINT_MAX : ARMRegisterInfo::getRegisterNumbering(PReg); + unsigned Count = 1; for (unsigned i = SIndex+1, e = MemOps.size(); i != e; ++i) { int NewOffset = MemOps[i].Offset; @@ -350,11 +351,14 @@ ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, : ARMRegisterInfo::getRegisterNumbering(Reg); // AM4 - register numbers in ascending order. // AM5 - consecutive register numbers in ascending order. + // Can only do up to 16 double-word registers per insn. if (Reg != ARM::SP && NewOffset == Offset + (int)Size && - ((isAM4 && RegNum > PRegNum) || RegNum == PRegNum+1)) { + ((isAM4 && RegNum > PRegNum) + || ((Size < 8 || Count < 16) && RegNum == PRegNum+1))) { Offset += Size; PRegNum = RegNum; + ++Count; } else { // Can't merge this in. Try merge the earlier ones first. MergeOpsUpdate(MBB, MemOps, SIndex, i, insertAfter, SOffset, diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp index 2dad7f110621..9e55cd870031 100644 --- a/lib/Target/ARM/ARMSubtarget.cpp +++ b/lib/Target/ARM/ARMSubtarget.cpp @@ -22,10 +22,6 @@ using namespace llvm; static cl::opt ReserveR9("arm-reserve-r9", cl::Hidden, cl::desc("Reserve R9, making it unavailable as GPR")); -static cl::opt -UseNEONFP("arm-use-neon-fp", - cl::desc("Use NEON for single-precision FP"), - cl::init(false), cl::Hidden); static cl::opt UseMOVT("arm-use-movt", @@ -35,7 +31,8 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &FS, bool isT) : ARMArchVersion(V4) , ARMFPUType(None) - , UseNEONForSinglePrecisionFP(UseNEONFP) + , UseNEONForSinglePrecisionFP(false) + , SlowVMLx(false) , IsThumb(isT) , ThumbMode(Thumb1) , PostRAScheduler(false) @@ -115,14 +112,6 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &FS, if (!isThumb() || hasThumb2()) PostRAScheduler = true; - - // Set CPU specific features. 
- if (CPUString == "cortex-a8") { - // On Cortex-a8, it's faster to perform some single-precision FP - // operations with NEON instructions. - if (UseNEONFP.getPosition() == 0) - UseNEONForSinglePrecisionFP = true; - } } /// GVIsIndirectSymbol - true if the GV will be accessed via an indirect symbol. diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index 2dc81a4d6d26..fa56a9195bc2 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -50,6 +50,10 @@ class ARMSubtarget : public TargetSubtarget { /// determine if NEON should actually be used. bool UseNEONForSinglePrecisionFP; + /// SlowVMLx - If the VFP2 instructions are available, indicates whether + /// the VML[AS] instructions are slow (if so, don't use them). + bool SlowVMLx; + /// IsThumb - True if we are in thumb mode, false if in ARM mode. bool IsThumb; @@ -119,6 +123,7 @@ class ARMSubtarget : public TargetSubtarget { bool hasNEON() const { return ARMFPUType >= NEON; } bool useNEONForSinglePrecisionFP() const { return hasNEON() && UseNEONForSinglePrecisionFP; } + bool useVMLx() const {return hasVFP2() && !SlowVMLx; } bool hasFP16() const { return HasFP16; } diff --git a/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp b/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp index 4a7a1e43bd36..ba736e3c0777 100644 --- a/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp +++ b/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp @@ -841,7 +841,7 @@ GetARMSetPICJumpTableLabel2(unsigned uid, unsigned uid2, raw_svector_ostream(Name) << MAI->getPrivateGlobalPrefix() << getFunctionNumber() << '_' << uid << '_' << uid2 << "_set_" << MBB->getNumber(); - return OutContext.GetOrCreateTemporarySymbol(Name.str()); + return OutContext.GetOrCreateSymbol(Name.str()); } MCSymbol *ARMAsmPrinter:: @@ -849,7 +849,7 @@ GetARMJTIPICJumpTableLabel2(unsigned uid, unsigned uid2) const { SmallString<60> Name; raw_svector_ostream(Name) << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber() << '_' << uid 
<< '_' << uid2; - return OutContext.GetOrCreateTemporarySymbol(Name.str()); + return OutContext.GetOrCreateSymbol(Name.str()); } void ARMAsmPrinter::printJTBlockOperand(const MachineInstr *MI, int OpNum) { @@ -1132,6 +1132,11 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) { OutStreamer.EmitIntValue(0, 4/*size*/, 0/*addrspace*/); else // Internal to current translation unit. + // + // When we place the LSDA into the TEXT section, the type info pointers + // need to be indirect and pc-rel. We accomplish this by using NLPs. + // However, sometimes the types are local to the file. So we need to + // fill in the value for the NLP in those cases. OutStreamer.EmitValue(MCSymbolRefExpr::Create(MCSym.getPointer(), OutContext), 4/*size*/, 0/*addrspace*/); @@ -1186,7 +1191,7 @@ void ARMAsmPrinter::printInstructionThroughMCStreamer(const MachineInstr *MI) { // FIXME: MOVE TO SHARED PLACE. unsigned Id = (unsigned)MI->getOperand(2).getImm(); const char *Prefix = MAI->getPrivateGlobalPrefix(); - MCSymbol *Label =OutContext.GetOrCreateTemporarySymbol(Twine(Prefix) + MCSymbol *Label =OutContext.GetOrCreateSymbol(Twine(Prefix) + "PC" + Twine(getFunctionNumber()) + "_" + Twine(Id)); OutStreamer.EmitLabel(Label); diff --git a/lib/Target/ARM/AsmPrinter/ARMMCInstLower.cpp b/lib/Target/ARM/AsmPrinter/ARMMCInstLower.cpp index 7cb305fc41c7..ab2b06b60783 100644 --- a/lib/Target/ARM/AsmPrinter/ARMMCInstLower.cpp +++ b/lib/Target/ARM/AsmPrinter/ARMMCInstLower.cpp @@ -75,7 +75,7 @@ GetJumpTableSymbol(const MachineOperand &MO) const { #endif // Create a symbol for the name. - return Ctx.GetOrCreateTemporarySymbol(Name.str()); + return Ctx.GetOrCreateSymbol(Name.str()); } MCSymbol *ARMMCInstLower:: @@ -91,7 +91,7 @@ GetConstantPoolIndexSymbol(const MachineOperand &MO) const { #endif // Create a symbol for the name. 
- return Ctx.GetOrCreateTemporarySymbol(Name.str()); + return Ctx.GetOrCreateSymbol(Name.str()); } MCOperand ARMMCInstLower:: diff --git a/lib/Target/ARM/NEONPreAllocPass.cpp b/lib/Target/ARM/NEONPreAllocPass.cpp index c36fe63b0693..7334259bf574 100644 --- a/lib/Target/ARM/NEONPreAllocPass.cpp +++ b/lib/Target/ARM/NEONPreAllocPass.cpp @@ -46,10 +46,13 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs, default: break; + case ARM::VLD1q8: + case ARM::VLD1q16: + case ARM::VLD1q32: + case ARM::VLD1q64: case ARM::VLD2d8: case ARM::VLD2d16: case ARM::VLD2d32: - case ARM::VLD2d64: case ARM::VLD2LNd8: case ARM::VLD2LNd16: case ARM::VLD2LNd32: @@ -83,7 +86,7 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs, case ARM::VLD3d8: case ARM::VLD3d16: case ARM::VLD3d32: - case ARM::VLD3d64: + case ARM::VLD1d64T: case ARM::VLD3LNd8: case ARM::VLD3LNd16: case ARM::VLD3LNd32: @@ -128,7 +131,7 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs, case ARM::VLD4d8: case ARM::VLD4d16: case ARM::VLD4d32: - case ARM::VLD4d64: + case ARM::VLD1d64Q: case ARM::VLD4LNd8: case ARM::VLD4LNd16: case ARM::VLD4LNd32: @@ -170,10 +173,13 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs, Stride = 2; return true; + case ARM::VST1q8: + case ARM::VST1q16: + case ARM::VST1q32: + case ARM::VST1q64: case ARM::VST2d8: case ARM::VST2d16: case ARM::VST2d32: - case ARM::VST2d64: case ARM::VST2LNd8: case ARM::VST2LNd16: case ARM::VST2LNd32: @@ -207,7 +213,7 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs, case ARM::VST3d8: case ARM::VST3d16: case ARM::VST3d32: - case ARM::VST3d64: + case ARM::VST1d64T: case ARM::VST3LNd8: case ARM::VST3LNd16: case ARM::VST3LNd32: @@ -252,7 +258,7 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs, case ARM::VST4d8: case ARM::VST4d16: case ARM::VST4d32: - case ARM::VST4d64: + case 
ARM::VST1d64Q: case ARM::VST4LNd8: case ARM::VST4LNd16: case ARM::VST4LNd32: diff --git a/lib/Target/ARM/README.txt b/lib/Target/ARM/README.txt index 57b65cf15d02..85d5ca05913c 100644 --- a/lib/Target/ARM/README.txt +++ b/lib/Target/ARM/README.txt @@ -12,6 +12,9 @@ Reimplement 'select' in terms of 'SEL'. A few ARMv6T2 ops should be pattern matched: BFI, SBFX, and UBFX +Interesting optimization for PIC codegen on arm-linux: +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43129 + //===---------------------------------------------------------------------===// Crazy idea: Consider code that uses lots of 8-bit or 16-bit values. By the diff --git a/lib/Target/ARM/Thumb1InstrInfo.cpp b/lib/Target/ARM/Thumb1InstrInfo.cpp index 29ae631269a1..ad98839ce262 100644 --- a/lib/Target/ARM/Thumb1InstrInfo.cpp +++ b/lib/Target/ARM/Thumb1InstrInfo.cpp @@ -200,6 +200,8 @@ restoreCalleeSavedRegisters(MachineBasicBlock &MBB, // It's illegal to emit pop instruction without operands. if (NumRegs) MBB.insert(MI, &*MIB); + else + MF.DeleteMachineInstr(MIB); return true; } diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp index e4abcdb2def5..55163f9b820c 100644 --- a/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -69,7 +69,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL = DebugLoc::getUnknownLoc(); if (I != MBB.end()) DL = I->getDebugLoc(); - if (RC == ARM::GPRRegisterClass) { + if (RC == ARM::GPRRegisterClass || RC == ARM::tGPRRegisterClass) { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = *MF.getFrameInfo(); MachineMemOperand *MMO = @@ -93,7 +93,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL = DebugLoc::getUnknownLoc(); if (I != MBB.end()) DL = I->getDebugLoc(); - if (RC == ARM::GPRRegisterClass) { + if (RC == ARM::GPRRegisterClass || RC == ARM::tGPRRegisterClass) { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo 
&MFI = *MF.getFrameInfo(); MachineMemOperand *MMO = diff --git a/lib/Target/Alpha/AlphaInstrInfo.cpp b/lib/Target/Alpha/AlphaInstrInfo.cpp index 39f0749ec424..d539e082118b 100644 --- a/lib/Target/Alpha/AlphaInstrInfo.cpp +++ b/lib/Target/Alpha/AlphaInstrInfo.cpp @@ -301,7 +301,15 @@ bool AlphaInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TB bool AllowModify) const { // If the block has no terminators, it just falls into the block after it. MachineBasicBlock::iterator I = MBB.end(); - if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) + if (I == MBB.begin()) + return false; + --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return false; + --I; + } + if (!isUnpredicatedTerminator(I)) return false; // Get the last instruction in the block. @@ -362,6 +370,11 @@ unsigned AlphaInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { MachineBasicBlock::iterator I = MBB.end(); if (I == MBB.begin()) return 0; --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return 0; + --I; + } if (I->getOpcode() != Alpha::BR && I->getOpcode() != Alpha::COND_BRANCH_I && I->getOpcode() != Alpha::COND_BRANCH_F) diff --git a/lib/Target/Blackfin/BlackfinInstrInfo.td b/lib/Target/Blackfin/BlackfinInstrInfo.td index e3c3993a0ce3..2471688b9052 100644 --- a/lib/Target/Blackfin/BlackfinInstrInfo.td +++ b/lib/Target/Blackfin/BlackfinInstrInfo.td @@ -65,23 +65,23 @@ def HI16 : SDNodeXForm(N->getSExtValue());}]>; -def uimm3 : PatLeaf<(imm), [{return isUint<3>(N->getZExtValue());}]>; -def uimm4 : PatLeaf<(imm), [{return isUint<4>(N->getZExtValue());}]>; -def uimm5 : PatLeaf<(imm), [{return isUint<5>(N->getZExtValue());}]>; +def uimm3 : PatLeaf<(imm), [{return isUInt<3>(N->getZExtValue());}]>; +def uimm4 : PatLeaf<(imm), [{return isUInt<4>(N->getZExtValue());}]>; +def uimm5 : PatLeaf<(imm), [{return isUInt<5>(N->getZExtValue());}]>; def uimm5m2 : PatLeaf<(imm), [{ uint64_t value = N->getZExtValue(); - return value % 2 == 0 && isUint<5>(value); + return 
value % 2 == 0 && isUInt<5>(value); }]>; def uimm6m4 : PatLeaf<(imm), [{ uint64_t value = N->getZExtValue(); - return value % 4 == 0 && isUint<6>(value); + return value % 4 == 0 && isUInt<6>(value); }]>; def imm7 : PatLeaf<(imm), [{return isInt<7>(N->getSExtValue());}]>; def imm16 : PatLeaf<(imm), [{return isInt<16>(N->getSExtValue());}]>; -def uimm16 : PatLeaf<(imm), [{return isUint<16>(N->getZExtValue());}]>; +def uimm16 : PatLeaf<(imm), [{return isUInt<16>(N->getZExtValue());}]>; def ximm16 : PatLeaf<(imm), [{ int64_t value = N->getSExtValue(); @@ -610,8 +610,7 @@ def MOVE_ncccc : F1<(outs NotCC:$cc), (ins JustCC:$sb), "cc = !cc;", []>; def MOVECC_zext : F1<(outs D:$dst), (ins JustCC:$cc), - "$dst = $cc;", - [/*(set D:$dst, (zext JustCC:$cc))*/]>; + "$dst = $cc;", []>; def MOVENCC_z : F1<(outs D:$dst), (ins NotCC:$cc), "$dst = cc;", []>; @@ -859,17 +858,5 @@ def : Pat<(BfinCall (i32 tglobaladdr:$dst)), (CALLa tglobaladdr:$dst)>; def : Pat<(BfinCall (i32 texternalsym:$dst)), (CALLa texternalsym:$dst)>; - -//def : Pat<(sext JustCC:$cc), -// (NEG (MOVECC_zext JustCC:$cc))>; -//def : Pat<(anyext JustCC:$cc), -// (MOVECC_zext JustCC:$cc)>; -def : Pat<(i16 (zext JustCC:$cc)), - (EXTRACT_SUBREG (MOVECC_zext JustCC:$cc), bfin_subreg_lo16)>; -def : Pat<(i16 (sext JustCC:$cc)), - (EXTRACT_SUBREG (NEG (MOVECC_zext JustCC:$cc)), bfin_subreg_lo16)>; -def : Pat<(i16 (anyext JustCC:$cc)), - (EXTRACT_SUBREG (MOVECC_zext JustCC:$cc), bfin_subreg_lo16)>; - def : Pat<(i16 (trunc D:$src)), (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS D:$src, D)), bfin_subreg_lo16)>; diff --git a/lib/Target/Blackfin/BlackfinIntrinsics.td b/lib/Target/Blackfin/BlackfinIntrinsics.td index bf02cfecf851..ce21b082376f 100644 --- a/lib/Target/Blackfin/BlackfinIntrinsics.td +++ b/lib/Target/Blackfin/BlackfinIntrinsics.td @@ -21,14 +21,14 @@ let TargetPrefix = "bfin", isTarget = 1 in { // Execute csync instruction with workarounds def int_bfin_csync : GCCBuiltin<"__builtin_bfin_csync">, - 
Intrinsic<[llvm_void_ty]>; + Intrinsic<[]>; // Execute ssync instruction with workarounds def int_bfin_ssync : GCCBuiltin<"__builtin_bfin_ssync">, - Intrinsic<[llvm_void_ty]>; + Intrinsic<[]>; // Execute idle instruction with workarounds def int_bfin_idle : GCCBuiltin<"__builtin_bfin_idle">, - Intrinsic<[llvm_void_ty]>; + Intrinsic<[]>; } diff --git a/lib/Target/Blackfin/BlackfinRegisterInfo.cpp b/lib/Target/Blackfin/BlackfinRegisterInfo.cpp index b39a342cecb3..84dc9cac504a 100644 --- a/lib/Target/Blackfin/BlackfinRegisterInfo.cpp +++ b/lib/Target/Blackfin/BlackfinRegisterInfo.cpp @@ -164,7 +164,7 @@ void BlackfinRegisterInfo::loadConstant(MachineBasicBlock &MBB, return; } - if (isUint<16>(value)) { + if (isUInt<16>(value)) { BuildMI(MBB, I, DL, TII.get(BF::LOADuimm16), Reg).addImm(value); return; } @@ -255,13 +255,13 @@ BlackfinRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, assert(FIPos==1 && "Bad frame index operand"); MI.getOperand(FIPos).ChangeToRegister(BaseReg, false); MI.getOperand(FIPos+1).setImm(Offset); - if (isUint<6>(Offset)) { + if (isUInt<6>(Offset)) { MI.setDesc(TII.get(isStore ? BF::STORE32p_uimm6m4 : BF::LOAD32p_uimm6m4)); return 0; } - if (BaseReg == BF::FP && isUint<7>(-Offset)) { + if (BaseReg == BF::FP && isUInt<7>(-Offset)) { MI.setDesc(TII.get(isStore ? BF::STORE32fp_nimm7m4 : BF::LOAD32fp_nimm7m4)); diff --git a/lib/Target/CellSPU/SPU.h b/lib/Target/CellSPU/SPU.h index c96097494ea3..1f215113b405 100644 --- a/lib/Target/CellSPU/SPU.h +++ b/lib/Target/CellSPU/SPU.h @@ -15,7 +15,6 @@ #ifndef LLVM_TARGET_IBMCELLSPU_H #define LLVM_TARGET_IBMCELLSPU_H -#include "llvm/System/DataTypes.h" #include "llvm/Target/TargetMachine.h" namespace llvm { @@ -25,73 +24,7 @@ namespace llvm { FunctionPass *createSPUISelDag(SPUTargetMachine &TM); - /*--== Utility functions/predicates/etc used all over the place: --==*/ - //! Predicate test for a signed 10-bit value - /*! 
- \param Value The input value to be tested - - This predicate tests for a signed 10-bit value, returning the 10-bit value - as a short if true. - */ - template - inline bool isS10Constant(T Value); - - template<> - inline bool isS10Constant(short Value) { - int SExtValue = ((int) Value << (32 - 10)) >> (32 - 10); - return ((Value > 0 && Value <= (1 << 9) - 1) - || (Value < 0 && (short) SExtValue == Value)); - } - - template<> - inline bool isS10Constant(int Value) { - return (Value >= -(1 << 9) && Value <= (1 << 9) - 1); - } - - template<> - inline bool isS10Constant(uint32_t Value) { - return (Value <= ((1 << 9) - 1)); - } - - template<> - inline bool isS10Constant(int64_t Value) { - return (Value >= -(1 << 9) && Value <= (1 << 9) - 1); - } - - template<> - inline bool isS10Constant(uint64_t Value) { - return (Value <= ((1 << 9) - 1)); - } - - //! Predicate test for an unsigned 10-bit value - /*! - \param Value The input value to be tested - - This predicate tests for an unsigned 10-bit value, returning the 10-bit value - as a short if true. - */ - inline bool isU10Constant(short Value) { - return (Value == (Value & 0x3ff)); - } - - inline bool isU10Constant(int Value) { - return (Value == (Value & 0x3ff)); - } - - inline bool isU10Constant(uint32_t Value) { - return (Value == (Value & 0x3ff)); - } - - inline bool isU10Constant(int64_t Value) { - return (Value == (Value & 0x3ff)); - } - - inline bool isU10Constant(uint64_t Value) { - return (Value == (Value & 0x3ff)); - } - extern Target TheCellSPUTarget; - } // Defines symbolic names for the SPU instructions. diff --git a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp index 396a92159089..90f83100cfab 100644 --- a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp +++ b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp @@ -44,28 +44,28 @@ namespace { bool isI64IntS10Immediate(ConstantSDNode *CN) { - return isS10Constant(CN->getSExtValue()); + return isInt<10>(CN->getSExtValue()); } //! 
ConstantSDNode predicate for i32 sign-extended, 10-bit immediates bool isI32IntS10Immediate(ConstantSDNode *CN) { - return isS10Constant(CN->getSExtValue()); + return isInt<10>(CN->getSExtValue()); } //! ConstantSDNode predicate for i32 unsigned 10-bit immediate values bool isI32IntU10Immediate(ConstantSDNode *CN) { - return isU10Constant(CN->getSExtValue()); + return isUInt<10>(CN->getSExtValue()); } //! ConstantSDNode predicate for i16 sign-extended, 10-bit immediate values bool isI16IntS10Immediate(ConstantSDNode *CN) { - return isS10Constant(CN->getSExtValue()); + return isInt<10>(CN->getSExtValue()); } //! SDNode predicate for i16 sign-extended, 10-bit immediate values @@ -80,7 +80,7 @@ namespace { bool isI16IntU10Immediate(ConstantSDNode *CN) { - return isU10Constant((short) CN->getZExtValue()); + return isUInt<10>((short) CN->getZExtValue()); } //! SDNode predicate for i16 sign-extended, 10-bit immediate values diff --git a/lib/Target/CellSPU/SPUISelLowering.cpp b/lib/Target/CellSPU/SPUISelLowering.cpp index e863ee31e412..4b0d4429d28b 100644 --- a/lib/Target/CellSPU/SPUISelLowering.cpp +++ b/lib/Target/CellSPU/SPUISelLowering.cpp @@ -1107,7 +1107,8 @@ SPUTargetLowering::LowerFormalArguments(SDValue Chain, VarArgsFrameIndex = MFI->CreateFixedObject(StackSlotSize, ArgOffset, true, false); SDValue FIN = DAG.getFrameIndex(VarArgsFrameIndex, PtrVT); - SDValue ArgVal = DAG.getRegister(ArgRegs[ArgRegIdx], MVT::v16i8); + unsigned VReg = MF.addLiveIn(ArgRegs[ArgRegIdx], &SPU::R32CRegClass); + SDValue ArgVal = DAG.getRegister(VReg, MVT::v16i8); SDValue Store = DAG.getStore(Chain, dl, ArgVal, FIN, NULL, 0, false, false, 0); Chain = Store.getOperand(0); @@ -1491,7 +1492,7 @@ SDValue SPU::get_vec_i10imm(SDNode *N, SelectionDAG &DAG, return SDValue(); Value = Value >> 32; } - if (isS10Constant(Value)) + if (isInt<10>(Value)) return DAG.getTargetConstant(Value, ValueType); } diff --git a/lib/Target/CellSPU/SPUInstrInfo.cpp b/lib/Target/CellSPU/SPUInstrInfo.cpp index 
2306665e1a38..86825c81861b 100644 --- a/lib/Target/CellSPU/SPUInstrInfo.cpp +++ b/lib/Target/CellSPU/SPUInstrInfo.cpp @@ -450,7 +450,15 @@ SPUInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, bool AllowModify) const { // If the block has no terminators, it just falls into the block after it. MachineBasicBlock::iterator I = MBB.end(); - if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) + if (I == MBB.begin()) + return false; + --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return false; + --I; + } + if (!isUnpredicatedTerminator(I)) return false; // Get the last instruction in the block. @@ -513,6 +521,11 @@ SPUInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { if (I == MBB.begin()) return 0; --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return 0; + --I; + } if (!isCondBranch(I) && !isUncondBranch(I)) return 0; diff --git a/lib/Target/CellSPU/SPUInstrInfo.td b/lib/Target/CellSPU/SPUInstrInfo.td index 5068f77a85be..6d1f87dd30ba 100644 --- a/lib/Target/CellSPU/SPUInstrInfo.td +++ b/lib/Target/CellSPU/SPUInstrInfo.td @@ -1268,7 +1268,12 @@ multiclass BitwiseAnd defm AND : BitwiseAnd; -// N.B.: vnot_conv is one of those special target selection pattern fragments, + +def vnot_cell_conv : PatFrag<(ops node:$in), + (xor node:$in, (bitconvert (v4i32 immAllOnesV)))>; + +// N.B.: vnot_cell_conv is one of those special target selection pattern +// fragments, // in which we expect there to be a bit_convert on the constant. Bear in mind // that llvm translates "not " to "xor , -1" (or in this case, a // constant -1 vector.) 
@@ -1301,7 +1306,7 @@ multiclass AndComplement def r8: ANDCRegInst; // Sometimes, the xor pattern has a bitcast constant: - def v16i8_conv: ANDCVecInst; + def v16i8_conv: ANDCVecInst; } defm ANDC : AndComplement; @@ -1934,7 +1939,7 @@ multiclass SelectBits def v16i8: SELBVecInst; def v8i16: SELBVecInst; def v4i32: SELBVecInst; - def v2i64: SELBVecInst; + def v2i64: SELBVecInst; def r128: SELBRegInst; def r64: SELBRegInst; @@ -4373,7 +4378,7 @@ def : Pat<(v2f64 (bitconvert (v16i8 VECREG:$src))), (v2f64 VECREG:$src)>; def : Pat<(v2f64 (bitconvert (v8i16 VECREG:$src))), (v2f64 VECREG:$src)>; def : Pat<(v2f64 (bitconvert (v4i32 VECREG:$src))), (v2f64 VECREG:$src)>; def : Pat<(v2f64 (bitconvert (v2i64 VECREG:$src))), (v2f64 VECREG:$src)>; -def : Pat<(v2f64 (bitconvert (v2f64 VECREG:$src))), (v2f64 VECREG:$src)>; +def : Pat<(v2f64 (bitconvert (v4f32 VECREG:$src))), (v2f64 VECREG:$src)>; def : Pat<(i128 (bitconvert (v16i8 VECREG:$src))), (ORi128_vec VECREG:$src)>; diff --git a/lib/Target/CellSPU/SPURegisterInfo.cpp b/lib/Target/CellSPU/SPURegisterInfo.cpp index 8c78bab37b6f..f3071f2e5697 100644 --- a/lib/Target/CellSPU/SPURegisterInfo.cpp +++ b/lib/Target/CellSPU/SPURegisterInfo.cpp @@ -28,6 +28,7 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineLocation.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/Target/TargetFrameInfo.h" #include "llvm/Target/TargetInstrInfo.h" @@ -336,6 +337,7 @@ SPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, MachineBasicBlock &MBB = *MI.getParent(); MachineFunction &MF = *MBB.getParent(); MachineFrameInfo *MFI = MF.getFrameInfo(); + DebugLoc dl = II->getDebugLoc(); while (!MI.getOperand(i).isFI()) { ++i; @@ -364,11 +366,22 @@ SPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, // Replace the FrameIndex with base register with $sp (aka $r1) 
SPOp.ChangeToRegister(SPU::R1, false); - if (Offset > SPUFrameInfo::maxFrameOffset() - || Offset < SPUFrameInfo::minFrameOffset()) { - errs() << "Large stack adjustment (" - << Offset - << ") in SPURegisterInfo::eliminateFrameIndex."; + + // if 'Offset' doesn't fit to the D-form instruction's + // immediate, convert the instruction to X-form + // if the instruction is not an AI (which takes a s10 immediate), assume + // it is a load/store that can take a s14 immediate + if ((MI.getOpcode() == SPU::AIr32 && !isInt<10>(Offset)) + || !isInt<14>(Offset)) { + int newOpcode = convertDFormToXForm(MI.getOpcode()); + unsigned tmpReg = findScratchRegister(II, RS, &SPU::R32CRegClass, SPAdj); + BuildMI(MBB, II, dl, TII.get(SPU::ILr32), tmpReg ) + .addImm(Offset); + BuildMI(MBB, II, dl, TII.get(newOpcode), MI.getOperand(0).getReg()) + .addReg(tmpReg, RegState::Kill) + .addReg(SPU::R1); + // remove the replaced D-form instruction + MBB.erase(II); } else { MO.ChangeToImmediate(Offset); } @@ -423,6 +436,14 @@ void SPURegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, MF.getRegInfo().setPhysRegUnused(SPU::R0); MF.getRegInfo().setPhysRegUnused(SPU::R1); MF.getRegInfo().setPhysRegUnused(SPU::R2); + + MachineFrameInfo *MFI = MF.getFrameInfo(); + const TargetRegisterClass *RC = &SPU::R32CRegClass; + RS->setScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(), + RC->getAlignment(), + false)); + + } void SPURegisterInfo::emitPrologue(MachineFunction &MF) const @@ -448,7 +469,8 @@ void SPURegisterInfo::emitPrologue(MachineFunction &MF) const assert((FrameSize & 0xf) == 0 && "SPURegisterInfo::emitPrologue: FrameSize not aligned"); - if (FrameSize > 0 || MFI->hasCalls()) { + // the "empty" frame size is 16 - just the register scavenger spill slot + if (FrameSize > 16 || MFI->hasCalls()) { FrameSize = -(FrameSize + SPUFrameInfo::minStackSize()); if (hasDebugInfo) { // Mark effective beginning of when frame pointer becomes valid. 
@@ -460,14 +482,14 @@ void SPURegisterInfo::emitPrologue(MachineFunction &MF) const // for the ABI BuildMI(MBB, MBBI, dl, TII.get(SPU::STQDr32), SPU::R0).addImm(16) .addReg(SPU::R1); - if (isS10Constant(FrameSize)) { + if (isInt<10>(FrameSize)) { // Spill $sp to adjusted $sp BuildMI(MBB, MBBI, dl, TII.get(SPU::STQDr32), SPU::R1).addImm(FrameSize) .addReg(SPU::R1); // Adjust $sp by required amout BuildMI(MBB, MBBI, dl, TII.get(SPU::AIr32), SPU::R1).addReg(SPU::R1) .addImm(FrameSize); - } else if (FrameSize <= (1 << 16) - 1 && FrameSize >= -(1 << 16)) { + } else if (isInt<16>(FrameSize)) { // Frame size can be loaded into ILr32n, so temporarily spill $r2 and use // $r2 to adjust $sp: BuildMI(MBB, MBBI, dl, TII.get(SPU::STQDr128), SPU::R2) @@ -475,7 +497,7 @@ void SPURegisterInfo::emitPrologue(MachineFunction &MF) const .addReg(SPU::R1); BuildMI(MBB, MBBI, dl, TII.get(SPU::ILr32), SPU::R2) .addImm(FrameSize); - BuildMI(MBB, MBBI, dl, TII.get(SPU::STQDr32), SPU::R1) + BuildMI(MBB, MBBI, dl, TII.get(SPU::STQXr32), SPU::R1) .addReg(SPU::R2) .addReg(SPU::R1); BuildMI(MBB, MBBI, dl, TII.get(SPU::Ar32), SPU::R1) @@ -549,9 +571,11 @@ SPURegisterInfo::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const "Can only insert epilog into returning blocks"); assert((FrameSize & 0xf) == 0 && "SPURegisterInfo::emitEpilogue: FrameSize not aligned"); - if (FrameSize > 0 || MFI->hasCalls()) { + + // the "empty" frame size is 16 - just the register scavenger spill slot + if (FrameSize > 16 || MFI->hasCalls()) { FrameSize = FrameSize + SPUFrameInfo::minStackSize(); - if (isS10Constant(FrameSize + LinkSlotOffset)) { + if (isInt<10>(FrameSize + LinkSlotOffset)) { // Reload $lr, adjust $sp by required amount // Note: We do this to slightly improve dual issue -- not by much, but it // is an opportunity for dual issue. 
@@ -574,7 +598,7 @@ SPURegisterInfo::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const .addReg(SPU::R2); BuildMI(MBB, MBBI, dl, TII.get(SPU::LQDr128), SPU::R0) .addImm(16) - .addReg(SPU::R2); + .addReg(SPU::R1); BuildMI(MBB, MBBI, dl, TII.get(SPU::SFIr32), SPU::R2). addReg(SPU::R2) .addImm(16); @@ -618,4 +642,43 @@ SPURegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const { return SPUGenRegisterInfo::getDwarfRegNumFull(RegNum, 0); } +int +SPURegisterInfo::convertDFormToXForm(int dFormOpcode) const +{ + switch(dFormOpcode) + { + case SPU::AIr32: return SPU::Ar32; + case SPU::LQDr32: return SPU::LQXr32; + case SPU::LQDr128: return SPU::LQXr128; + case SPU::LQDv16i8: return SPU::LQXv16i8; + case SPU::LQDv4f32: return SPU::LQXv4f32; + case SPU::STQDr32: return SPU::STQXr32; + case SPU::STQDr128: return SPU::STQXr128; + case SPU::STQDv16i8: return SPU::STQXv16i8; + case SPU::STQDv4i32: return SPU::STQXv4i32; + case SPU::STQDv4f32: return SPU::STQXv4f32; + + default: assert( false && "Unhandled D to X-form conversion"); + } + // default will assert, but need to return something to keep the + // compiler happy. + return dFormOpcode; +} + +// TODO this is already copied from PPC. Could this convenience function +// be moved to the RegScavenger class? 
+unsigned +SPURegisterInfo::findScratchRegister(MachineBasicBlock::iterator II, + RegScavenger *RS, + const TargetRegisterClass *RC, + int SPAdj) const +{ + assert(RS && "Register scavenging must be on"); + unsigned Reg = RS->FindUnusedReg(RC); + if (Reg == 0) + Reg = RS->scavengeRegister(RC, II, SPAdj); + assert( Reg && "Register scavenger failed"); + return Reg; +} + #include "SPUGenRegisterInfo.inc" diff --git a/lib/Target/CellSPU/SPURegisterInfo.h b/lib/Target/CellSPU/SPURegisterInfo.h index 48feb5c452a4..0a7031835696 100644 --- a/lib/Target/CellSPU/SPURegisterInfo.h +++ b/lib/Target/CellSPU/SPURegisterInfo.h @@ -53,6 +53,10 @@ namespace llvm { virtual const TargetRegisterClass* const * getCalleeSavedRegClasses(const MachineFunction *MF) const; + //! Allow for scavenging, so we can get scratch registers when needed. + virtual bool requiresRegisterScavenging(const MachineFunction &MF) const + { return true; } + //! Return the reserved registers BitVector getReservedRegs(const MachineFunction &MF) const; @@ -97,6 +101,21 @@ namespace llvm { //! Get DWARF debugging register number int getDwarfRegNum(unsigned RegNum, bool isEH) const; + + //! Convert D-form load/store to X-form load/store + /*! + Converts a register displacement load/store into a register-indexed + load/store for large stack frames, when the stack frame exceeds the + range of a s10 displacement. + */ + int convertDFormToXForm(int dFormOpcode) const; + + //! Acquire an unused register in an emergency.
+ unsigned findScratchRegister(MachineBasicBlock::iterator II, + RegScavenger *RS, + const TargetRegisterClass *RC, + int SPAdj) const; + }; } // end namespace llvm diff --git a/lib/Target/MBlaze/MBlazeIntrinsics.td b/lib/Target/MBlaze/MBlazeIntrinsics.td index 76eb56367ade..82552fa4b343 100644 --- a/lib/Target/MBlaze/MBlazeIntrinsics.td +++ b/lib/Target/MBlaze/MBlazeIntrinsics.td @@ -21,11 +21,11 @@ let TargetPrefix = "mblaze", isTarget = 1 in { [llvm_i32_ty], [IntrWriteMem]>; - class MBFSL_Put_Intrinsic : Intrinsic<[llvm_void_ty], + class MBFSL_Put_Intrinsic : Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrWriteMem]>; - class MBFSL_PutT_Intrinsic : Intrinsic<[llvm_void_ty], + class MBFSL_PutT_Intrinsic : Intrinsic<[], [llvm_i32_ty], [IntrWriteMem]>; } diff --git a/lib/Target/MSIL/MSILWriter.cpp b/lib/Target/MSIL/MSILWriter.cpp index ac41cc8d0311..15d16ecfe9d3 100644 --- a/lib/Target/MSIL/MSILWriter.cpp +++ b/lib/Target/MSIL/MSILWriter.cpp @@ -424,7 +424,7 @@ void MSILWriter::printPtrLoad(uint64_t N) { case Module::Pointer32: printSimpleInstruction("ldc.i4",utostr(N).c_str()); // FIXME: Need overflow test? - if (!isUInt32(N)) { + if (!isUInt<32>(N)) { errs() << "Value = " << utostr(N) << '\n'; llvm_unreachable("32-bit pointer overflowed"); } diff --git a/lib/Target/MSP430/AsmPrinter/MSP430MCInstLower.cpp b/lib/Target/MSP430/AsmPrinter/MSP430MCInstLower.cpp index 32c6b0435a7a..f4d7d8a52886 100644 --- a/lib/Target/MSP430/AsmPrinter/MSP430MCInstLower.cpp +++ b/lib/Target/MSP430/AsmPrinter/MSP430MCInstLower.cpp @@ -59,7 +59,7 @@ GetJumpTableSymbol(const MachineOperand &MO) const { } // Create a symbol for the name. - return Ctx.GetOrCreateTemporarySymbol(Name.str()); + return Ctx.GetOrCreateSymbol(Name.str()); } MCSymbol *MSP430MCInstLower:: @@ -75,7 +75,7 @@ GetConstantPoolIndexSymbol(const MachineOperand &MO) const { } // Create a symbol for the name. 
- return Ctx.GetOrCreateTemporarySymbol(Name.str()); + return Ctx.GetOrCreateSymbol(Name.str()); } MCOperand MSP430MCInstLower:: diff --git a/lib/Target/MSP430/MSP430InstrInfo.cpp b/lib/Target/MSP430/MSP430InstrInfo.cpp index 63724823f56d..e584770dd497 100644 --- a/lib/Target/MSP430/MSP430InstrInfo.cpp +++ b/lib/Target/MSP430/MSP430InstrInfo.cpp @@ -173,6 +173,8 @@ unsigned MSP430InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { while (I != MBB.begin()) { --I; + if (I->isDebugValue()) + continue; if (I->getOpcode() != MSP430::JMP && I->getOpcode() != MSP430::JCC) break; @@ -241,6 +243,9 @@ bool MSP430InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock::iterator I = MBB.end(); while (I != MBB.begin()) { --I; + if (I->isDebugValue()) + continue; + // Working from the bottom, when we see a non-terminator // instruction, we're done. if (!isUnpredicatedTerminator(I)) diff --git a/lib/Target/Mangler.cpp b/lib/Target/Mangler.cpp index fb9370607499..1d5c51164e3f 100644 --- a/lib/Target/Mangler.cpp +++ b/lib/Target/Mangler.cpp @@ -235,10 +235,7 @@ std::string Mangler::getNameWithPrefix(const GlobalValue *GV, MCSymbol *Mangler::getSymbol(const GlobalValue *GV) { SmallString<60> NameStr; getNameWithPrefix(NameStr, GV, false); - if (!GV->hasPrivateLinkage()) - return Context.GetOrCreateSymbol(NameStr.str()); - - return Context.GetOrCreateTemporarySymbol(NameStr.str()); + return Context.GetOrCreateSymbol(NameStr.str()); } diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td index fa4518d732f1..e948917eb80e 100644 --- a/lib/Target/Mips/MipsInstrFPU.td +++ b/lib/Target/Mips/MipsInstrFPU.td @@ -26,8 +26,9 @@ // Floating Point Compare and Branch def SDT_MipsFPBrcond : SDTypeProfile<0, 3, [SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisVT<1, OtherVT>]>; -def SDT_MipsFPCmp : SDTypeProfile<0, 3, [SDTCisSameAs<0, 1>, SDTCisFP<0>, - SDTCisInt<2>]>; +def SDT_MipsFPCmp : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, + SDTCisSameAs<1, 2>, SDTCisFP<1>, + 
SDTCisInt<3>]>; def SDT_MipsFPSelectCC : SDTypeProfile<1, 4, [SDTCisInt<1>, SDTCisInt<4>, SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3>]>; @@ -244,12 +245,13 @@ def MIPS_FCOND_NGT : PatLeaf<(i32 15)>; /// Floating Point Compare let hasDelaySlot = 1, Defs=[FCR31] in { def FCMP_S32 : FCC<0x0, (outs), (ins FGR32:$fs, FGR32:$ft, condcode:$cc), - "c.$cc.s $fs, $ft", [(MipsFPCmp FGR32:$fs, FGR32:$ft, imm:$cc), - (implicit FCR31)]>; + "c.$cc.s $fs, $ft", + [(set FCR31, (MipsFPCmp FGR32:$fs, FGR32:$ft, imm:$cc))]>; def FCMP_D32 : FCC<0x1, (outs), (ins AFGR64:$fs, AFGR64:$ft, condcode:$cc), - "c.$cc.d $fs, $ft", [(MipsFPCmp AFGR64:$fs, AFGR64:$ft, imm:$cc), - (implicit FCR31)]>, Requires<[In32BitMode]>; + "c.$cc.d $fs, $ft", + [(set FCR31, (MipsFPCmp AFGR64:$fs, AFGR64:$ft, imm:$cc))]>, + Requires<[In32BitMode]>; } //===----------------------------------------------------------------------===// diff --git a/lib/Target/Mips/MipsInstrInfo.cpp b/lib/Target/Mips/MipsInstrInfo.cpp index 1a9bffc24f88..85cf064b9f52 100644 --- a/lib/Target/Mips/MipsInstrInfo.cpp +++ b/lib/Target/Mips/MipsInstrInfo.cpp @@ -433,7 +433,15 @@ bool MipsInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, { // If the block has no terminators, it just falls into the block after it. MachineBasicBlock::iterator I = MBB.end(); - if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) + if (I == MBB.begin()) + return false; + --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return false; + --I; + } + if (!isUnpredicatedTerminator(I)) return false; // Get the last instruction in the block. 
@@ -562,6 +570,11 @@ RemoveBranch(MachineBasicBlock &MBB) const MachineBasicBlock::iterator I = MBB.end(); if (I == MBB.begin()) return 0; --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return 0; + --I; + } if (I->getOpcode() != Mips::J && GetCondFromBranchOpc(I->getOpcode()) == Mips::COND_INVALID) return 0; diff --git a/lib/Target/PIC16/PIC16InstrInfo.cpp b/lib/Target/PIC16/PIC16InstrInfo.cpp index 2fb405e947ff..da16e8383c31 100644 --- a/lib/Target/PIC16/PIC16InstrInfo.cpp +++ b/lib/Target/PIC16/PIC16InstrInfo.cpp @@ -226,6 +226,11 @@ bool PIC16InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, // Get the terminator instruction. --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return true; + --I; + } // Handle unconditional branches. If the unconditional branch's target is // successor basic block then remove the unconditional branch. if (I->getOpcode() == PIC16::br_uncond && AllowModify) { diff --git a/lib/Target/PIC16/PIC16Section.cpp b/lib/Target/PIC16/PIC16Section.cpp index a96ebb8d8e34..2505b111f1e8 100644 --- a/lib/Target/PIC16/PIC16Section.cpp +++ b/lib/Target/PIC16/PIC16Section.cpp @@ -17,10 +17,9 @@ using namespace llvm; // This is the only way to create a PIC16Section. Sections created here // do not need to be explicitly deleted as they are managed by auto_ptrs. -PIC16Section *PIC16Section::Create(const StringRef &Name, - PIC16SectionType Ty, - const std::string &Address, - int Color, MCContext &Ctx) { +PIC16Section *PIC16Section::Create(StringRef Name, PIC16SectionType Ty, + StringRef Address, int Color, + MCContext &Ctx) { /// Determine the internal SectionKind info. /// Users of PIC16Section class should not need to know the internal @@ -59,8 +58,17 @@ PIC16Section *PIC16Section::Create(const StringRef &Name, } + // Copy strings into context allocated memory so they get free'd when the + // context is destroyed. 
+ char *NameCopy = static_cast(Ctx.Allocate(Name.size(), 1)); + memcpy(NameCopy, Name.data(), Name.size()); + char *AddressCopy = static_cast(Ctx.Allocate(Address.size(), 1)); + memcpy(AddressCopy, Address.data(), Address.size()); + // Create the Section. - PIC16Section *S = new (Ctx) PIC16Section(Name, K, Address, Color); + PIC16Section *S = + new (Ctx) PIC16Section(StringRef(NameCopy, Name.size()), K, + StringRef(AddressCopy, Address.size()), Color); S->T = Ty; return S; } diff --git a/lib/Target/PIC16/PIC16Section.h b/lib/Target/PIC16/PIC16Section.h index 566f9207dbcd..9039ca781a30 100644 --- a/lib/Target/PIC16/PIC16Section.h +++ b/lib/Target/PIC16/PIC16Section.h @@ -30,11 +30,11 @@ namespace llvm { PIC16SectionType T; /// Name of the section to uniquely identify it. - std::string Name; + StringRef Name; /// User can specify an address at which a section should be placed. /// Negative value here means user hasn't specified any. - std::string Address; + StringRef Address; /// Overlay information - Sections with same color can be overlaid on /// one another. @@ -43,17 +43,16 @@ namespace llvm { /// Total size of all data objects contained here. unsigned Size; - PIC16Section(const StringRef &name, SectionKind K, const std::string &addr, - int color) + PIC16Section(StringRef name, SectionKind K, StringRef addr, int color) : MCSection(K), Name(name), Address(addr), Color(color), Size(0) { } public: /// Return the name of the section. - const std::string &getName() const { return Name; } + StringRef getName() const { return Name; } /// Return the Address of the section. - const std::string &getAddress() const { return Address; } + StringRef getAddress() const { return Address; } /// Return the Color of the section. int getColor() const { return Color; } @@ -64,6 +63,8 @@ namespace llvm { void setSize(unsigned size) { Size = size; } /// Conatined data objects. + // FIXME: This vector is leaked because sections are allocated with a + // BumpPtrAllocator. 
std::vectorItems; /// Check section type. @@ -77,8 +78,8 @@ namespace llvm { PIC16SectionType getType() const { return T; } /// This would be the only way to create a section. - static PIC16Section *Create(const StringRef &Name, PIC16SectionType Ty, - const std::string &Address, int Color, + static PIC16Section *Create(StringRef Name, PIC16SectionType Ty, + StringRef Address, int Color, MCContext &Ctx); /// Override this as PIC16 has its own way of printing switching diff --git a/lib/Target/PowerPC/AsmPrinter/PPCAsmPrinter.cpp b/lib/Target/PowerPC/AsmPrinter/PPCAsmPrinter.cpp index ed6fc9d51cb2..5adefd39c51b 100644 --- a/lib/Target/PowerPC/AsmPrinter/PPCAsmPrinter.cpp +++ b/lib/Target/PowerPC/AsmPrinter/PPCAsmPrinter.cpp @@ -309,8 +309,8 @@ namespace { const MCSymbol *&TOCEntry = TOC[Sym]; if (TOCEntry == 0) TOCEntry = OutContext. - GetOrCreateTemporarySymbol(StringRef(MAI->getPrivateGlobalPrefix()) + - "C" + Twine(LabelID++)); + GetOrCreateSymbol(StringRef(MAI->getPrivateGlobalPrefix()) + + "C" + Twine(LabelID++)); O << *TOCEntry << "@toc"; } @@ -674,14 +674,14 @@ static const MCSymbol *GetLazyPtr(const MCSymbol *Sym, MCContext &Ctx) { // Remove $stub suffix, add $lazy_ptr. SmallString<128> TmpStr(Sym->getName().begin(), Sym->getName().end()-5); TmpStr += "$lazy_ptr"; - return Ctx.GetOrCreateTemporarySymbol(TmpStr.str()); + return Ctx.GetOrCreateSymbol(TmpStr.str()); } static const MCSymbol *GetAnonSym(const MCSymbol *Sym, MCContext &Ctx) { // Add $tmp suffix to $stub, yielding $stub$tmp. SmallString<128> TmpStr(Sym->getName().begin(), Sym->getName().end()); TmpStr += "$tmp"; - return Ctx.GetOrCreateTemporarySymbol(TmpStr.str()); + return Ctx.GetOrCreateSymbol(TmpStr.str()); } void PPCDarwinAsmPrinter:: @@ -811,6 +811,11 @@ bool PPCDarwinAsmPrinter::doFinalization(Module &M) { OutStreamer.EmitIntValue(0, isPPC64 ? 8 : 4/*size*/, 0/*addrspace*/); else // Internal to current translation unit. 
+ // + // When we place the LSDA into the TEXT section, the type info pointers + // need to be indirect and pc-rel. We accomplish this by using NLPs. + // However, sometimes the types are local to the file. So we need to + // fill in the value for the NLP in those cases. OutStreamer.EmitValue(MCSymbolRefExpr::Create(MCSym.getPointer(), OutContext), isPPC64 ? 8 : 4/*size*/, 0/*addrspace*/); diff --git a/lib/Target/PowerPC/PPCBranchSelector.cpp b/lib/Target/PowerPC/PPCBranchSelector.cpp index a752421580d4..52948c868b9c 100644 --- a/lib/Target/PowerPC/PPCBranchSelector.cpp +++ b/lib/Target/PowerPC/PPCBranchSelector.cpp @@ -130,7 +130,7 @@ bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) { } // If this branch is in range, ignore it. - if (isInt16(BranchSize)) { + if (isInt<16>(BranchSize)) { MBBStartOffset += 4; continue; } diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 9d79c0ddb1c8..4f88d35deaf8 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -470,11 +470,11 @@ SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS, if (CC == ISD::SETEQ || CC == ISD::SETNE) { if (isInt32Immediate(RHS, Imm)) { // SETEQ/SETNE comparison with 16-bit immediate, fold it. - if (isUInt16(Imm)) + if (isUInt<16>(Imm)) return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, LHS, getI32Imm(Imm & 0xFFFF)), 0); // If this is a 16-bit signed immediate, fold it. 
- if (isInt16((int)Imm)) + if (isInt<16>((int)Imm)) return SDValue(CurDAG->getMachineNode(PPC::CMPWI, dl, MVT::i32, LHS, getI32Imm(Imm & 0xFFFF)), 0); @@ -494,7 +494,7 @@ SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS, } Opc = PPC::CMPLW; } else if (ISD::isUnsignedIntSetCC(CC)) { - if (isInt32Immediate(RHS, Imm) && isUInt16(Imm)) + if (isInt32Immediate(RHS, Imm) && isUInt<16>(Imm)) return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, LHS, getI32Imm(Imm & 0xFFFF)), 0); Opc = PPC::CMPLW; @@ -511,11 +511,11 @@ SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS, if (CC == ISD::SETEQ || CC == ISD::SETNE) { if (isInt64Immediate(RHS.getNode(), Imm)) { // SETEQ/SETNE comparison with 16-bit immediate, fold it. - if (isUInt16(Imm)) + if (isUInt<16>(Imm)) return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, LHS, getI32Imm(Imm & 0xFFFF)), 0); // If this is a 16-bit signed immediate, fold it. - if (isInt16(Imm)) + if (isInt<16>(Imm)) return SDValue(CurDAG->getMachineNode(PPC::CMPDI, dl, MVT::i64, LHS, getI32Imm(Imm & 0xFFFF)), 0); @@ -528,7 +528,7 @@ SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS, // xoris r0,r3,0x1234 // cmpldi cr0,r0,0x5678 // beq cr0,L6 - if (isUInt32(Imm)) { + if (isUInt<32>(Imm)) { SDValue Xor(CurDAG->getMachineNode(PPC::XORIS8, dl, MVT::i64, LHS, getI64Imm(Imm >> 16)), 0); return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, Xor, @@ -537,7 +537,7 @@ SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS, } Opc = PPC::CMPLD; } else if (ISD::isUnsignedIntSetCC(CC)) { - if (isInt64Immediate(RHS.getNode(), Imm) && isUInt16(Imm)) + if (isInt64Immediate(RHS.getNode(), Imm) && isUInt<16>(Imm)) return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, LHS, getI64Imm(Imm & 0xFFFF)), 0); Opc = PPC::CMPLD; @@ -761,12 +761,12 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { unsigned Shift = 0; // If it can't be represented as a 32 bit value. 
- if (!isInt32(Imm)) { + if (!isInt<32>(Imm)) { Shift = CountTrailingZeros_64(Imm); int64_t ImmSh = static_cast(Imm) >> Shift; // If the shifted value fits 32 bits. - if (isInt32(ImmSh)) { + if (isInt<32>(ImmSh)) { // Go with the shifted value. Imm = ImmSh; } else { @@ -785,7 +785,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { unsigned Hi = (Imm >> 16) & 0xFFFF; // Simple value. - if (isInt16(Imm)) { + if (isInt<16>(Imm)) { // Just the Lo bits. Result = CurDAG->getMachineNode(PPC::LI8, dl, MVT::i64, getI32Imm(Lo)); } else if (Lo) { diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 2c072c1290f6..e67666d80481 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -5539,8 +5539,16 @@ PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { return false; } -EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size, unsigned Align, - bool isSrcConst, bool isSrcStr, +/// getOptimalMemOpType - Returns the target specific optimal type for load +/// and store operations as a result of memset, memcpy, and memmove lowering. +/// If DstAlign is zero that means it's safe to destination alignment can +/// satisfy any constraint. Similarly if SrcAlign is zero it means there +/// isn't a need to check it against alignment requirement, probably because +/// the source does not need to be loaded. It returns EVT::Other if +/// SelectionDAG should be responsible for determining it. 
+EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size, + unsigned DstAlign, unsigned SrcAlign, + bool SafeToUseFP, SelectionDAG &DAG) const { if (this->PPCSubTarget.isPPC64()) { return MVT::i64; diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index 9c390ac10145..19fefab2d3a0 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -347,9 +347,16 @@ namespace llvm { virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const; - virtual EVT getOptimalMemOpType(uint64_t Size, unsigned Align, - bool isSrcConst, bool isSrcStr, - SelectionDAG &DAG) const; + /// getOptimalMemOpType - Returns the target specific optimal type for load + /// and store operations as a result of memset, memcpy, and memmove lowering. + /// If DstAlign is zero that means it's safe to destination alignment can + /// satisfy any constraint. Similarly if SrcAlign is zero it means there + /// isn't a need to check it against alignment requirement, probably because + /// the source does not need to be loaded. It returns EVT::Other if + /// SelectionDAG should be responsible for determining it. + virtual EVT getOptimalMemOpType(uint64_t Size, + unsigned DstAlign, unsigned SrcAlign, + bool SafeToUseFP, SelectionDAG &DAG) const; /// getFunctionAlignment - Return the Log2 alignment of this function. virtual unsigned getFunctionAlignment(const Function *F) const; diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td index 3ff8f27ddb4d..256370fa5f52 100644 --- a/lib/Target/PowerPC/PPCInstrAltivec.td +++ b/lib/Target/PowerPC/PPCInstrAltivec.td @@ -15,6 +15,10 @@ // Altivec transformation functions and pattern fragments. // +// Since we canonicalize buildvectors to v16i8, all vnots "-1" operands will be +// of that type. 
+def vnot_ppc : PatFrag<(ops node:$in), + (xor node:$in, (bitconvert (v16i8 immAllOnesV)))>; def vpkuhum_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle node:$lhs, node:$rhs), [{ @@ -321,7 +325,8 @@ def VAND : VXForm_1<1028, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB), [(set VRRC:$vD, (and (v4i32 VRRC:$vA), VRRC:$vB))]>; def VANDC : VXForm_1<1092, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB), "vandc $vD, $vA, $vB", VecFP, - [(set VRRC:$vD, (and (v4i32 VRRC:$vA), (vnot VRRC:$vB)))]>; + [(set VRRC:$vD, (and (v4i32 VRRC:$vA), + (vnot_ppc VRRC:$vB)))]>; def VCFSX : VXForm_1<842, (outs VRRC:$vD), (ins u5imm:$UIMM, VRRC:$vB), "vcfsx $vD, $vB, $UIMM", VecFP, @@ -435,7 +440,8 @@ def VSUM4UBS: VX1_Int<1544, "vsum4ubs", int_ppc_altivec_vsum4ubs>; def VNOR : VXForm_1<1284, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB), "vnor $vD, $vA, $vB", VecFP, - [(set VRRC:$vD, (vnot (or (v4i32 VRRC:$vA), VRRC:$vB)))]>; + [(set VRRC:$vD, (vnot_ppc (or (v4i32 VRRC:$vA), + VRRC:$vB)))]>; def VOR : VXForm_1<1156, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB), "vor $vD, $vA, $vB", VecFP, [(set VRRC:$vD, (or (v4i32 VRRC:$vA), VRRC:$vB))]>; @@ -640,12 +646,11 @@ def:Pat<(vmrghw_unary_shuffle (v16i8 VRRC:$vA), undef), (VMRGHW VRRC:$vA, VRRC:$vA)>; // Logical Operations -def : Pat<(v4i32 (vnot VRRC:$vA)), (VNOR VRRC:$vA, VRRC:$vA)>; -def : Pat<(v4i32 (vnot_conv VRRC:$vA)), (VNOR VRRC:$vA, VRRC:$vA)>; +def : Pat<(v4i32 (vnot_ppc VRRC:$vA)), (VNOR VRRC:$vA, VRRC:$vA)>; -def : Pat<(v4i32 (vnot_conv (or VRRC:$A, VRRC:$B))), +def : Pat<(v4i32 (vnot_ppc (or VRRC:$A, VRRC:$B))), (VNOR VRRC:$A, VRRC:$B)>; -def : Pat<(v4i32 (and VRRC:$A, (vnot_conv VRRC:$B))), +def : Pat<(v4i32 (and VRRC:$A, (vnot_ppc VRRC:$B))), (VANDC VRRC:$A, VRRC:$B)>; def : Pat<(fmul VRRC:$vA, VRRC:$vB), diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index 9895bea1223a..82c637efaf25 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -213,7 +213,15 @@ 
bool PPCInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB, bool AllowModify) const { // If the block has no terminators, it just falls into the block after it. MachineBasicBlock::iterator I = MBB.end(); - if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) + if (I == MBB.begin()) + return false; + --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return false; + --I; + } + if (!isUnpredicatedTerminator(I)) return false; // Get the last instruction in the block. @@ -281,6 +289,11 @@ unsigned PPCInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { MachineBasicBlock::iterator I = MBB.end(); if (I == MBB.begin()) return 0; --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return 0; + --I; + } if (I->getOpcode() != PPC::B && I->getOpcode() != PPC::BCC) return 0; diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp index 6e7880e22ce2..44c5fe672f86 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -512,7 +512,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineInstr *MI = I; DebugLoc dl = MI->getDebugLoc(); - if (isInt16(CalleeAmt)) { + if (isInt<16>(CalleeAmt)) { BuildMI(MBB, I, dl, TII.get(ADDIInstr), StackReg).addReg(StackReg). addImm(CalleeAmt); } else { @@ -596,7 +596,7 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II, else Reg = PPC::R0; - if (MaxAlign < TargetAlign && isInt16(FrameSize)) { + if (MaxAlign < TargetAlign && isInt<16>(FrameSize)) { BuildMI(MBB, II, dl, TII.get(PPC::ADDI), Reg) .addReg(PPC::R31) .addImm(FrameSize); @@ -798,7 +798,7 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // clear can be encoded. This is extremely uncommon, because normally you // only "std" to a stack slot that is at least 4-byte aligned, but it can // happen in invalid code. 
- if (isInt16(Offset) && (!isIXAddr || (Offset & 3) == 0)) { + if (isInt<16>(Offset) && (!isIXAddr || (Offset & 3) == 0)) { if (isIXAddr) Offset >>= 2; // The actual encoded value has the low two bits zero. MI.getOperand(OffsetOperandNo).ChangeToImmediate(Offset); @@ -1375,8 +1375,9 @@ PPCRegisterInfo::emitPrologue(MachineFunction &MF) const { if (!isPPC64) { // PPC32. if (ALIGN_STACK && MaxAlign > TargetAlign) { - assert(isPowerOf2_32(MaxAlign)&&isInt16(MaxAlign)&&"Invalid alignment!"); - assert(isInt16(NegFrameSize) && "Unhandled stack size and alignment!"); + assert(isPowerOf2_32(MaxAlign) && isInt<16>(MaxAlign) && + "Invalid alignment!"); + assert(isInt<16>(NegFrameSize) && "Unhandled stack size and alignment!"); BuildMI(MBB, MBBI, dl, TII.get(PPC::RLWINM), PPC::R0) .addReg(PPC::R1) @@ -1390,7 +1391,7 @@ PPCRegisterInfo::emitPrologue(MachineFunction &MF) const { .addReg(PPC::R1) .addReg(PPC::R1) .addReg(PPC::R0); - } else if (isInt16(NegFrameSize)) { + } else if (isInt<16>(NegFrameSize)) { BuildMI(MBB, MBBI, dl, TII.get(PPC::STWU), PPC::R1) .addReg(PPC::R1) .addImm(NegFrameSize) @@ -1408,8 +1409,9 @@ PPCRegisterInfo::emitPrologue(MachineFunction &MF) const { } } else { // PPC64. 
if (ALIGN_STACK && MaxAlign > TargetAlign) { - assert(isPowerOf2_32(MaxAlign)&&isInt16(MaxAlign)&&"Invalid alignment!"); - assert(isInt16(NegFrameSize) && "Unhandled stack size and alignment!"); + assert(isPowerOf2_32(MaxAlign) && isInt<16>(MaxAlign) && + "Invalid alignment!"); + assert(isInt<16>(NegFrameSize) && "Unhandled stack size and alignment!"); BuildMI(MBB, MBBI, dl, TII.get(PPC::RLDICL), PPC::X0) .addReg(PPC::X1) @@ -1422,7 +1424,7 @@ PPCRegisterInfo::emitPrologue(MachineFunction &MF) const { .addReg(PPC::X1) .addReg(PPC::X1) .addReg(PPC::X0); - } else if (isInt16(NegFrameSize)) { + } else if (isInt<16>(NegFrameSize)) { BuildMI(MBB, MBBI, dl, TII.get(PPC::STDU), PPC::X1) .addReg(PPC::X1) .addImm(NegFrameSize / 4) @@ -1591,7 +1593,7 @@ void PPCRegisterInfo::emitEpilogue(MachineFunction &MF, // enabled (=> hasFastCall()==true) the fastcc call might contain a tail // call which invalidates the stack pointer value in SP(0). So we use the // value of R31 in this case. - if (FI->hasFastCall() && isInt16(FrameSize)) { + if (FI->hasFastCall() && isInt<16>(FrameSize)) { assert(hasFP(MF) && "Expecting a valid the frame pointer."); BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDI), PPC::R1) .addReg(PPC::R31).addImm(FrameSize); @@ -1605,7 +1607,7 @@ void PPCRegisterInfo::emitEpilogue(MachineFunction &MF, .addReg(PPC::R1) .addReg(PPC::R31) .addReg(PPC::R0); - } else if (isInt16(FrameSize) && + } else if (isInt<16>(FrameSize) && (!ALIGN_STACK || TargetAlign >= MaxAlign) && !MFI->hasVarSizedObjects()) { BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDI), PPC::R1) @@ -1615,7 +1617,7 @@ void PPCRegisterInfo::emitEpilogue(MachineFunction &MF, .addImm(0).addReg(PPC::R1); } } else { - if (FI->hasFastCall() && isInt16(FrameSize)) { + if (FI->hasFastCall() && isInt<16>(FrameSize)) { assert(hasFP(MF) && "Expecting a valid the frame pointer."); BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDI8), PPC::X1) .addReg(PPC::X31).addImm(FrameSize); @@ -1629,7 +1631,7 @@ void 
PPCRegisterInfo::emitEpilogue(MachineFunction &MF, .addReg(PPC::X1) .addReg(PPC::X31) .addReg(PPC::X0); - } else if (isInt16(FrameSize) && TargetAlign >= MaxAlign && + } else if (isInt<16>(FrameSize) && TargetAlign >= MaxAlign && !MFI->hasVarSizedObjects()) { BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDI8), PPC::X1) .addReg(PPC::X1).addImm(FrameSize); @@ -1678,7 +1680,7 @@ void PPCRegisterInfo::emitEpilogue(MachineFunction &MF, unsigned LISInstr = isPPC64 ? PPC::LIS8 : PPC::LIS; unsigned ORIInstr = isPPC64 ? PPC::ORI8 : PPC::ORI; - if (CallerAllocatedAmt && isInt16(CallerAllocatedAmt)) { + if (CallerAllocatedAmt && isInt<16>(CallerAllocatedAmt)) { BuildMI(MBB, MBBI, dl, TII.get(ADDIInstr), StackReg) .addReg(StackReg).addImm(CallerAllocatedAmt); } else { diff --git a/lib/Target/SystemZ/SystemZInstrFP.td b/lib/Target/SystemZ/SystemZInstrFP.td index f46840c46c6a..8c5e9058561a 100644 --- a/lib/Target/SystemZ/SystemZInstrFP.td +++ b/lib/Target/SystemZ/SystemZInstrFP.td @@ -316,19 +316,19 @@ def FBCONVF64 : Pseudo<(outs FP64:$dst), (ins GR64:$src), let Defs = [PSW] in { def FCMP32rr : Pseudo<(outs), (ins FP32:$src1, FP32:$src2), "cebr\t$src1, $src2", - [(SystemZcmp FP32:$src1, FP32:$src2), (implicit PSW)]>; + [(set PSW, (SystemZcmp FP32:$src1, FP32:$src2))]>; def FCMP64rr : Pseudo<(outs), (ins FP64:$src1, FP64:$src2), "cdbr\t$src1, $src2", - [(SystemZcmp FP64:$src1, FP64:$src2), (implicit PSW)]>; + [(set PSW, (SystemZcmp FP64:$src1, FP64:$src2))]>; def FCMP32rm : Pseudo<(outs), (ins FP32:$src1, rriaddr12:$src2), "ceb\t$src1, $src2", - [(SystemZcmp FP32:$src1, (load rriaddr12:$src2)), - (implicit PSW)]>; + [(set PSW, (SystemZcmp FP32:$src1, + (load rriaddr12:$src2)))]>; def FCMP64rm : Pseudo<(outs), (ins FP64:$src1, rriaddr12:$src2), "cdb\t$src1, $src2", - [(SystemZcmp FP64:$src1, (load rriaddr12:$src2)), - (implicit PSW)]>; + [(set PSW, (SystemZcmp FP64:$src1, + (load rriaddr12:$src2)))]>; } // Defs = [PSW] 
//===----------------------------------------------------------------------===// diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp index 5fa7e8cec172..06f01e7b32b8 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -424,6 +424,8 @@ bool SystemZInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock::iterator I = MBB.end(); while (I != MBB.begin()) { --I; + if (I->isDebugValue()) + continue; // Working from the bottom, when we see a non-terminator // instruction, we're done. if (!isUnpredicatedTerminator(I)) @@ -500,6 +502,8 @@ unsigned SystemZInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { while (I != MBB.begin()) { --I; + if (I->isDebugValue()) + continue; if (I->getOpcode() != SystemZ::JMP && getCondFromBranchOpc(I->getOpcode()) == SystemZCC::INVALID) break; diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td index 0d1af23cba14..22bde4ee7df2 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.td +++ b/lib/Target/SystemZ/SystemZInstrInfo.td @@ -31,7 +31,8 @@ class SDTCisI64 : SDTCisVT; def SDT_SystemZCall : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>; def SDT_SystemZCallSeqStart : SDCallSeqStart<[SDTCisI64<0>]>; def SDT_SystemZCallSeqEnd : SDCallSeqEnd<[SDTCisI64<0>, SDTCisI64<1>]>; -def SDT_CmpTest : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>; +def SDT_CmpTest : SDTypeProfile<1, 2, [SDTCisI64<0>, + SDTCisSameAs<1, 2>]>; def SDT_BrCond : SDTypeProfile<0, 3, [SDTCisVT<0, OtherVT>, SDTCisI8<1>, SDTCisVT<2, i64>]>; @@ -980,100 +981,89 @@ let Defs = [PSW] in { def CMP32rr : RRI<0x19, (outs), (ins GR32:$src1, GR32:$src2), "cr\t$src1, $src2", - [(SystemZcmp GR32:$src1, GR32:$src2), - (implicit PSW)]>; + [(set PSW, (SystemZcmp GR32:$src1, GR32:$src2))]>; def CMP64rr : RREI<0xB920, (outs), (ins GR64:$src1, GR64:$src2), "cgr\t$src1, $src2", - [(SystemZcmp GR64:$src1, GR64:$src2), - (implicit PSW)]>; + [(set PSW, (SystemZcmp 
GR64:$src1, GR64:$src2))]>; def CMP32ri : RILI<0xC2D, (outs), (ins GR32:$src1, s32imm:$src2), "cfi\t$src1, $src2", - [(SystemZcmp GR32:$src1, imm:$src2), - (implicit PSW)]>; + [(set PSW, (SystemZcmp GR32:$src1, imm:$src2))]>; def CMP64ri32 : RILI<0xC2C, (outs), (ins GR64:$src1, s32imm64:$src2), "cgfi\t$src1, $src2", - [(SystemZcmp GR64:$src1, i64immSExt32:$src2), - (implicit PSW)]>; + [(set PSW, (SystemZcmp GR64:$src1, i64immSExt32:$src2))]>; def CMP32rm : RXI<0x59, (outs), (ins GR32:$src1, rriaddr12:$src2), "c\t$src1, $src2", - [(SystemZcmp GR32:$src1, (load rriaddr12:$src2)), - (implicit PSW)]>; + [(set PSW, (SystemZcmp GR32:$src1, (load rriaddr12:$src2)))]>; def CMP32rmy : RXYI<0xE359, (outs), (ins GR32:$src1, rriaddr:$src2), "cy\t$src1, $src2", - [(SystemZcmp GR32:$src1, (load rriaddr:$src2)), - (implicit PSW)]>; + [(set PSW, (SystemZcmp GR32:$src1, (load rriaddr:$src2)))]>; def CMP64rm : RXYI<0xE320, (outs), (ins GR64:$src1, rriaddr:$src2), "cg\t$src1, $src2", - [(SystemZcmp GR64:$src1, (load rriaddr:$src2)), - (implicit PSW)]>; + [(set PSW, (SystemZcmp GR64:$src1, (load rriaddr:$src2)))]>; def UCMP32rr : RRI<0x15, (outs), (ins GR32:$src1, GR32:$src2), "clr\t$src1, $src2", - [(SystemZucmp GR32:$src1, GR32:$src2), - (implicit PSW)]>; + [(set PSW, (SystemZucmp GR32:$src1, GR32:$src2))]>; def UCMP64rr : RREI<0xB921, (outs), (ins GR64:$src1, GR64:$src2), "clgr\t$src1, $src2", - [(SystemZucmp GR64:$src1, GR64:$src2), - (implicit PSW)]>; + [(set PSW, (SystemZucmp GR64:$src1, GR64:$src2))]>; def UCMP32ri : RILI<0xC2F, (outs), (ins GR32:$src1, i32imm:$src2), "clfi\t$src1, $src2", - [(SystemZucmp GR32:$src1, imm:$src2), - (implicit PSW)]>; + [(set PSW, (SystemZucmp GR32:$src1, imm:$src2))]>; def UCMP64ri32 : RILI<0xC2E, (outs), (ins GR64:$src1, i64i32imm:$src2), "clgfi\t$src1, $src2", - [(SystemZucmp GR64:$src1, i64immZExt32:$src2), - (implicit PSW)]>; + [(set PSW,(SystemZucmp GR64:$src1, i64immZExt32:$src2))]>; def UCMP32rm : RXI<0x55, (outs), (ins GR32:$src1, 
rriaddr12:$src2), "cl\t$src1, $src2", - [(SystemZucmp GR32:$src1, (load rriaddr12:$src2)), - (implicit PSW)]>; + [(set PSW, (SystemZucmp GR32:$src1, + (load rriaddr12:$src2)))]>; def UCMP32rmy : RXYI<0xE355, (outs), (ins GR32:$src1, rriaddr:$src2), "cly\t$src1, $src2", - [(SystemZucmp GR32:$src1, (load rriaddr:$src2)), - (implicit PSW)]>; + [(set PSW, (SystemZucmp GR32:$src1, + (load rriaddr:$src2)))]>; def UCMP64rm : RXYI<0xE351, (outs), (ins GR64:$src1, rriaddr:$src2), "clg\t$src1, $src2", - [(SystemZucmp GR64:$src1, (load rriaddr:$src2)), - (implicit PSW)]>; + [(set PSW, (SystemZucmp GR64:$src1, + (load rriaddr:$src2)))]>; def CMPSX64rr32 : RREI<0xB930, (outs), (ins GR64:$src1, GR32:$src2), "cgfr\t$src1, $src2", - [(SystemZucmp GR64:$src1, (sext GR32:$src2)), - (implicit PSW)]>; + [(set PSW, (SystemZucmp GR64:$src1, + (sext GR32:$src2)))]>; def UCMPZX64rr32 : RREI<0xB931, (outs), (ins GR64:$src1, GR32:$src2), "clgfr\t$src1, $src2", - [(SystemZucmp GR64:$src1, (zext GR32:$src2)), - (implicit PSW)]>; + [(set PSW, (SystemZucmp GR64:$src1, + (zext GR32:$src2)))]>; def CMPSX64rm32 : RXYI<0xE330, (outs), (ins GR64:$src1, rriaddr:$src2), "cgf\t$src1, $src2", - [(SystemZucmp GR64:$src1, (sextloadi64i32 rriaddr:$src2)), - (implicit PSW)]>; + [(set PSW, (SystemZucmp GR64:$src1, + (sextloadi64i32 rriaddr:$src2)))]>; def UCMPZX64rm32 : RXYI<0xE331, (outs), (ins GR64:$src1, rriaddr:$src2), "clgf\t$src1, $src2", - [(SystemZucmp GR64:$src1, (zextloadi64i32 rriaddr:$src2)), - (implicit PSW)]>; + [(set PSW, (SystemZucmp GR64:$src1, + (zextloadi64i32 rriaddr:$src2)))]>; // FIXME: Add other crazy ucmp forms diff --git a/lib/Target/X86/AsmPrinter/X86AsmPrinter.cpp b/lib/Target/X86/AsmPrinter/X86AsmPrinter.cpp index c3dcf8e66eab..66bb914edde5 100644 --- a/lib/Target/X86/AsmPrinter/X86AsmPrinter.cpp +++ b/lib/Target/X86/AsmPrinter/X86AsmPrinter.cpp @@ -524,6 +524,11 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { OutStreamer.EmitIntValue(0, 4/*size*/, 0/*addrspace*/); else // 
Internal to current translation unit. + // + // When we place the LSDA into the TEXT section, the type info + // pointers need to be indirect and pc-rel. We accomplish this by + // using NLPs. However, sometimes the types are local to the file. So + // we need to fill in the value for the NLP in those cases. OutStreamer.EmitValue(MCSymbolRefExpr::Create(MCSym.getPointer(), OutContext), 4/*size*/, 0/*addrspace*/); diff --git a/lib/Target/X86/AsmPrinter/X86MCInstLower.cpp b/lib/Target/X86/AsmPrinter/X86MCInstLower.cpp index 7d29d97ce0ee..c851ca3fc805 100644 --- a/lib/Target/X86/AsmPrinter/X86MCInstLower.cpp +++ b/lib/Target/X86/AsmPrinter/X86MCInstLower.cpp @@ -83,7 +83,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const { case X86II::MO_DARWIN_NONLAZY: case X86II::MO_DARWIN_NONLAZY_PIC_BASE: { Name += "$non_lazy_ptr"; - MCSymbol *Sym = Ctx.GetOrCreateTemporarySymbol(Name.str()); + MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name.str()); MachineModuleInfoImpl::StubValueTy &StubSym = getMachOMMI().getGVStubEntry(Sym); @@ -98,7 +98,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const { } case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: { Name += "$non_lazy_ptr"; - MCSymbol *Sym = Ctx.GetOrCreateTemporarySymbol(Name.str()); + MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name.str()); MachineModuleInfoImpl::StubValueTy &StubSym = getMachOMMI().getHiddenGVStubEntry(Sym); if (StubSym.getPointer() == 0) { @@ -112,7 +112,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const { } case X86II::MO_DARWIN_STUB: { Name += "$stub"; - MCSymbol *Sym = Ctx.GetOrCreateTemporarySymbol(Name.str()); + MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name.str()); MachineModuleInfoImpl::StubValueTy &StubSym = getMachOMMI().getFnStubEntry(Sym); if (StubSym.getPointer()) @@ -127,7 +127,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const { Name.erase(Name.end()-5, Name.end()); StubSym = MachineModuleInfoImpl:: - StubValueTy(Ctx.GetOrCreateTemporarySymbol(Name.str()), false); + 
StubValueTy(Ctx.GetOrCreateSymbol(Name.str()), false); } return Sym; } @@ -287,7 +287,9 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { LowerUnaryToTwoAddr(OutMI, X86::MMX_PCMPEQDrr); break; case X86::FsFLD0SS: LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break; case X86::FsFLD0SD: LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break; - case X86::V_SET0: LowerUnaryToTwoAddr(OutMI, X86::XORPSrr); break; + case X86::V_SET0PS: LowerUnaryToTwoAddr(OutMI, X86::XORPSrr); break; + case X86::V_SET0PD: LowerUnaryToTwoAddr(OutMI, X86::XORPDrr); break; + case X86::V_SET0PI: LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break; case X86::V_SETALLONES: LowerUnaryToTwoAddr(OutMI, X86::PCMPEQDrr); break; case X86::MOV16r0: diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt index 4d3dedf0e5f1..22285f193234 100644 --- a/lib/Target/X86/CMakeLists.txt +++ b/lib/Target/X86/CMakeLists.txt @@ -15,6 +15,7 @@ tablegen(X86GenCallingConv.inc -gen-callingconv) tablegen(X86GenSubtarget.inc -gen-subtarget) set(sources + SSEDomainFix.cpp X86AsmBackend.cpp X86CodeEmitter.cpp X86COFFMachineModuleInfo.cpp diff --git a/lib/Target/X86/SSEDomainFix.cpp b/lib/Target/X86/SSEDomainFix.cpp new file mode 100644 index 000000000000..395ab5705ae2 --- /dev/null +++ b/lib/Target/X86/SSEDomainFix.cpp @@ -0,0 +1,499 @@ +//===- SSEDomainFix.cpp - Use proper int/float domain for SSE ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the SSEDomainFix pass. +// +// Some SSE instructions like mov, and, or, xor are available in different +// variants for different operand types. These variant instructions are +// equivalent, but on Nehalem and newer cpus there is extra latency +// transferring data between integer and floating point domains. 
+// +// This pass changes the variant instructions to minimize domain crossings. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "sse-domain-fix" +#include "X86InstrInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +namespace { + +/// Allocate objects from a pool, allow objects to be recycled, and provide a +/// way of deleting everything. +template +class PoolAllocator { + std::vector Pages, Avail; +public: + ~PoolAllocator() { Clear(); } + + T* Alloc() { + if (Avail.empty()) { + T *p = new T[PageSize]; + Pages.push_back(p); + Avail.reserve(PageSize); + for (unsigned n = 0; n != PageSize; ++n) + Avail.push_back(p+n); + } + T *p = Avail.back(); + Avail.pop_back(); + return p; + } + + // Allow object to be reallocated. It won't be reconstructed. + void Recycle(T *p) { + p->clear(); + Avail.push_back(p); + } + + // Destroy all objects, make sure there are no external pointers to them. + void Clear() { + Avail.clear(); + while (!Pages.empty()) { + delete[] Pages.back(); + Pages.pop_back(); + } + } +}; + +/// A DomainValue is a bit like LiveIntervals' ValNo, but it also keeps track +/// of execution domains. +/// +/// An open DomainValue represents a set of instructions that can still switch +/// execution domain. Multiple registers may refer to the same open +/// DomainValue - they will eventually be collapsed to the same execution +/// domain. +/// +/// A collapsed DomainValue represents a single register that has been forced +/// into one of more execution domains. There is a separate collapsed +/// DomainValue for each register, but it may contain multiple execution +/// domains. 
A register value is initially created in a single execution +/// domain, but if we were forced to pay the penalty of a domain crossing, we +/// keep track of the fact the the register is now available in multiple +/// domains. +struct DomainValue { + // Basic reference counting. + unsigned Refs; + + // Available domains. For an open DomainValue, it is the still possible + // domains for collapsing. For a collapsed DomainValue it is the domains where + // the register is available for free. + unsigned Mask; + + // Position of the last defining instruction. + unsigned Dist; + + // Twiddleable instructions using or defining these registers. + SmallVector Instrs; + + // Collapsed DomainValue have no instructions to twiddle - it simply keeps + // track of the domains where the registers are already available. + bool collapsed() const { return Instrs.empty(); } + + // Is any domain in mask available? + bool compat(unsigned mask) const { + return Mask & mask; + } + + // Mark domain as available. + void add(unsigned domain) { + Mask |= 1u << domain; + } + + // First domain available in mask. 
+ unsigned firstDomain() const { + return CountTrailingZeros_32(Mask); + } + + DomainValue() { clear(); } + + void clear() { + Refs = Mask = Dist = 0; + Instrs.clear(); + } +}; + +static const unsigned NumRegs = 16; + +class SSEDomainFixPass : public MachineFunctionPass { + static char ID; + PoolAllocator Pool; + + MachineFunction *MF; + const X86InstrInfo *TII; + const TargetRegisterInfo *TRI; + MachineBasicBlock *MBB; + DomainValue **LiveRegs; + typedef DenseMap LiveOutMap; + LiveOutMap LiveOuts; + unsigned Distance; + +public: + SSEDomainFixPass() : MachineFunctionPass(&ID) {} + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + virtual bool runOnMachineFunction(MachineFunction &MF); + + virtual const char *getPassName() const { + return "SSE execution domain fixup"; + } + +private: + // Register mapping. + int RegIndex(unsigned Reg); + + // LiveRegs manipulations. + void SetLiveReg(int rx, DomainValue *DV); + void Kill(int rx); + void Force(int rx, unsigned domain); + void Collapse(DomainValue *dv, unsigned domain); + bool Merge(DomainValue *A, DomainValue *B); + + void enterBasicBlock(); + void visitGenericInstr(MachineInstr*); + void visitSoftInstr(MachineInstr*, unsigned mask); + void visitHardInstr(MachineInstr*, unsigned domain); +}; +} + +char SSEDomainFixPass::ID = 0; + +/// Translate TRI register number to an index into our smaller tables of +/// interesting registers. Return -1 for boring registers. +int SSEDomainFixPass::RegIndex(unsigned reg) { + // Registers are sorted lexicographically. + // We just need them to be consecutive, ordering doesn't matter. + assert(X86::XMM9 == X86::XMM0+NumRegs-1 && "Unexpected sort"); + reg -= X86::XMM0; + return reg < NumRegs ? reg : -1; +} + +/// Set LiveRegs[rx] = dv, updating reference counts. 
+void SSEDomainFixPass::SetLiveReg(int rx, DomainValue *dv) { + assert(unsigned(rx) < NumRegs && "Invalid index"); + if (!LiveRegs) + LiveRegs = (DomainValue**)calloc(sizeof(DomainValue*), NumRegs); + + if (LiveRegs[rx] == dv) + return; + if (LiveRegs[rx]) { + assert(LiveRegs[rx]->Refs && "Bad refcount"); + if (--LiveRegs[rx]->Refs == 0) Pool.Recycle(LiveRegs[rx]); + } + LiveRegs[rx] = dv; + if (dv) ++dv->Refs; +} + +// Kill register rx, recycle or collapse any DomainValue. +void SSEDomainFixPass::Kill(int rx) { + assert(unsigned(rx) < NumRegs && "Invalid index"); + if (!LiveRegs || !LiveRegs[rx]) return; + + // Before killing the last reference to an open DomainValue, collapse it to + // the first available domain. + if (LiveRegs[rx]->Refs == 1 && !LiveRegs[rx]->collapsed()) + Collapse(LiveRegs[rx], LiveRegs[rx]->firstDomain()); + else + SetLiveReg(rx, 0); +} + +/// Force register rx into domain. +void SSEDomainFixPass::Force(int rx, unsigned domain) { + assert(unsigned(rx) < NumRegs && "Invalid index"); + DomainValue *dv; + if (LiveRegs && (dv = LiveRegs[rx])) { + if (dv->collapsed()) + dv->add(domain); + else + Collapse(dv, domain); + } else { + // Set up basic collapsed DomainValue. + dv = Pool.Alloc(); + dv->Dist = Distance; + dv->add(domain); + SetLiveReg(rx, dv); + } +} + +/// Collapse open DomainValue into given domain. If there are multiple +/// registers using dv, they each get a unique collapsed DomainValue. +void SSEDomainFixPass::Collapse(DomainValue *dv, unsigned domain) { + assert(dv->compat(1u << domain) && "Cannot collapse"); + + // Collapse all the instructions. + while (!dv->Instrs.empty()) { + MachineInstr *mi = dv->Instrs.back(); + TII->SetSSEDomain(mi, domain); + dv->Instrs.pop_back(); + } + dv->Mask = 1u << domain; + + // If there are multiple users, give them new, unique DomainValues. 
+ if (LiveRegs && dv->Refs > 1) { + for (unsigned rx = 0; rx != NumRegs; ++rx) + if (LiveRegs[rx] == dv) { + DomainValue *dv2 = Pool.Alloc(); + dv2->Dist = Distance; + dv2->add(domain); + SetLiveReg(rx, dv2); + } + } +} + +/// Merge - All instructions and registers in B are moved to A, and B is +/// released. +bool SSEDomainFixPass::Merge(DomainValue *A, DomainValue *B) { + assert(!A->collapsed() && "Cannot merge into collapsed"); + assert(!B->collapsed() && "Cannot merge from collapsed"); + if (A == B) + return true; + if (!A->compat(B->Mask)) + return false; + A->Mask &= B->Mask; + A->Dist = std::max(A->Dist, B->Dist); + A->Instrs.append(B->Instrs.begin(), B->Instrs.end()); + for (unsigned rx = 0; rx != NumRegs; ++rx) + if (LiveRegs[rx] == B) + SetLiveReg(rx, A); + return true; +} + +void SSEDomainFixPass::enterBasicBlock() { + // Try to coalesce live-out registers from predecessors. + for (MachineBasicBlock::const_livein_iterator i = MBB->livein_begin(), + e = MBB->livein_end(); i != e; ++i) { + int rx = RegIndex(*i); + if (rx < 0) continue; + for (MachineBasicBlock::const_pred_iterator pi = MBB->pred_begin(), + pe = MBB->pred_end(); pi != pe; ++pi) { + LiveOutMap::const_iterator fi = LiveOuts.find(*pi); + if (fi == LiveOuts.end()) continue; + DomainValue *pdv = fi->second[rx]; + if (!pdv) continue; + if (!LiveRegs || !LiveRegs[rx]) { + SetLiveReg(rx, pdv); + continue; + } + + // We have a live DomainValue from more than one predecessor. + if (LiveRegs[rx]->collapsed()) { + // We are already collapsed, but predecessor is not. Force him. + if (!pdv->collapsed()) + Collapse(pdv, LiveRegs[rx]->firstDomain()); + continue; + } + + // Currently open, merge in predecessor. + if (!pdv->collapsed()) + Merge(LiveRegs[rx], pdv); + else + Collapse(LiveRegs[rx], pdv->firstDomain()); + } + } +} + +// A hard instruction only works in one domain. All input registers will be +// forced into that domain. 
+void SSEDomainFixPass::visitHardInstr(MachineInstr *mi, unsigned domain) { + // Collapse all uses. + for (unsigned i = mi->getDesc().getNumDefs(), + e = mi->getDesc().getNumOperands(); i != e; ++i) { + MachineOperand &mo = mi->getOperand(i); + if (!mo.isReg()) continue; + int rx = RegIndex(mo.getReg()); + if (rx < 0) continue; + Force(rx, domain); + } + + // Kill all defs and force them. + for (unsigned i = 0, e = mi->getDesc().getNumDefs(); i != e; ++i) { + MachineOperand &mo = mi->getOperand(i); + if (!mo.isReg()) continue; + int rx = RegIndex(mo.getReg()); + if (rx < 0) continue; + Kill(rx); + Force(rx, domain); + } +} + +// A soft instruction can be changed to work in other domains given by mask. +void SSEDomainFixPass::visitSoftInstr(MachineInstr *mi, unsigned mask) { + // Scan the explicit use operands for incoming domains. + unsigned collmask = mask; + SmallVector used; + if (LiveRegs) + for (unsigned i = mi->getDesc().getNumDefs(), + e = mi->getDesc().getNumOperands(); i != e; ++i) { + MachineOperand &mo = mi->getOperand(i); + if (!mo.isReg()) continue; + int rx = RegIndex(mo.getReg()); + if (rx < 0) continue; + if (DomainValue *dv = LiveRegs[rx]) { + // Is it possible to use this collapsed register for free? + if (dv->collapsed()) { + if (unsigned m = collmask & dv->Mask) + collmask = m; + } else if (dv->compat(collmask)) + used.push_back(rx); + else + Kill(rx); + } + } + + // If the collapsed operands force a single domain, propagate the collapse. + if (isPowerOf2_32(collmask)) { + unsigned domain = CountTrailingZeros_32(collmask); + TII->SetSSEDomain(mi, domain); + visitHardInstr(mi, domain); + return; + } + + // Kill off any remaining uses that don't match collmask, and build a list of + // incoming DomainValue that we want to merge. + SmallVector doms; + for (SmallVector::iterator i=used.begin(), e=used.end(); i!=e; ++i) { + int rx = *i; + DomainValue *dv = LiveRegs[rx]; + // This useless DomainValue could have been missed above. 
+ if (!dv->compat(collmask)) { + Kill(*i); + continue; + } + // sorted, uniqued insert. + bool inserted = false; + for (SmallVector::iterator i = doms.begin(), e = doms.end(); + i != e && !inserted; ++i) { + if (dv == *i) + inserted = true; + else if (dv->Dist < (*i)->Dist) { + inserted = true; + doms.insert(i, dv); + } + } + if (!inserted) + doms.push_back(dv); + } + + // doms are now sorted in order of appearance. Try to merge them all, giving + // priority to the latest ones. + DomainValue *dv = 0; + while (!doms.empty()) { + if (!dv) { + dv = doms.pop_back_val(); + continue; + } + + DomainValue *ThisDV = doms.pop_back_val(); + if (Merge(dv, ThisDV)) continue; + + for (SmallVector::iterator i=used.begin(), e=used.end(); i != e; ++i) + if (LiveRegs[*i] == ThisDV) + Kill(*i); + } + + // dv is the DomainValue we are going to use for this instruction. + if (!dv) + dv = Pool.Alloc(); + dv->Dist = Distance; + dv->Mask = collmask; + dv->Instrs.push_back(mi); + + // Finally set all defs and non-collapsed uses to dv. + for (unsigned i = 0, e = mi->getDesc().getNumOperands(); i != e; ++i) { + MachineOperand &mo = mi->getOperand(i); + if (!mo.isReg()) continue; + int rx = RegIndex(mo.getReg()); + if (rx < 0) continue; + if (!LiveRegs || !LiveRegs[rx] || (mo.isDef() && LiveRegs[rx]!=dv)) { + Kill(rx); + SetLiveReg(rx, dv); + } + } +} + +void SSEDomainFixPass::visitGenericInstr(MachineInstr *mi) { + // Process explicit defs, kill any XMM registers redefined. 
+ for (unsigned i = 0, e = mi->getDesc().getNumDefs(); i != e; ++i) { + MachineOperand &mo = mi->getOperand(i); + if (!mo.isReg()) continue; + int rx = RegIndex(mo.getReg()); + if (rx < 0) continue; + Kill(rx); + } +} + +bool SSEDomainFixPass::runOnMachineFunction(MachineFunction &mf) { + MF = &mf; + TII = static_cast(MF->getTarget().getInstrInfo()); + TRI = MF->getTarget().getRegisterInfo(); + MBB = 0; + LiveRegs = 0; + Distance = 0; + assert(NumRegs == X86::VR128RegClass.getNumRegs() && "Bad regclass"); + + // If no XMM registers are used in the function, we can skip it completely. + bool anyregs = false; + for (TargetRegisterClass::const_iterator I = X86::VR128RegClass.begin(), + E = X86::VR128RegClass.end(); I != E; ++I) + if (MF->getRegInfo().isPhysRegUsed(*I)) { + anyregs = true; + break; + } + if (!anyregs) return false; + + MachineBasicBlock *Entry = MF->begin(); + SmallPtrSet Visited; + for (df_ext_iterator > + DFI = df_ext_begin(Entry, Visited), DFE = df_ext_end(Entry, Visited); + DFI != DFE; ++DFI) { + MBB = *DFI; + enterBasicBlock(); + for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; + ++I) { + MachineInstr *mi = I; + if (mi->isDebugValue()) continue; + ++Distance; + std::pair domp = TII->GetSSEDomain(mi); + if (domp.first) + if (domp.second) + visitSoftInstr(mi, domp.second); + else + visitHardInstr(mi, domp.first); + else if (LiveRegs) + visitGenericInstr(mi); + } + + // Save live registers at end of MBB - used by enterBasicBlock(). + if (LiveRegs) + LiveOuts.insert(std::make_pair(MBB, LiveRegs)); + LiveRegs = 0; + } + + // Clear the LiveOuts vectors. Should we also collapse any remaining + // DomainValues? 
+ for (LiveOutMap::const_iterator i = LiveOuts.begin(), e = LiveOuts.end(); + i != e; ++i) + free(i->second); + LiveOuts.clear(); + Pool.Clear(); + + return false; +} + +FunctionPass *llvm::createSSEDomainFixPass() { + return new SSEDomainFixPass(); +} diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index c753cf2a530f..9be38a4b56a9 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -41,6 +41,10 @@ FunctionPass *createX86ISelDag(X86TargetMachine &TM, /// FunctionPass *createX86FloatingPointStackifierPass(); +/// createSSEDomainFixPass - This pass twiddles SSE opcodes to prevent domain +/// crossings. +FunctionPass *createSSEDomainFixPass(); + /// createX87FPRegKillInserterPass - This function returns a pass which /// inserts FP_REG_KILL instructions where needed. /// diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 2be51e1a1366..6b62795a6e14 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -59,6 +59,9 @@ def Feature64Bit : SubtargetFeature<"64bit", "HasX86_64", "true", [FeatureCMOV]>; def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true", "Bit testing of memory is slow">; +def FeatureFastUAMem : SubtargetFeature<"fast-unaligned-mem", + "IsUAMemFast", "true", + "Fast unaligned memory access">; def FeatureSSE4A : SubtargetFeature<"sse4a", "HasSSE4A", "true", "Support SSE 4a instructions">; @@ -98,8 +101,10 @@ def : Proc<"nocona", [FeatureSSE3, Feature64Bit, FeatureSlowBTMem]>; def : Proc<"core2", [FeatureSSSE3, Feature64Bit, FeatureSlowBTMem]>; def : Proc<"penryn", [FeatureSSE41, Feature64Bit, FeatureSlowBTMem]>; def : Proc<"atom", [FeatureSSE3, Feature64Bit, FeatureSlowBTMem]>; -def : Proc<"corei7", [FeatureSSE42, Feature64Bit, FeatureSlowBTMem]>; -def : Proc<"nehalem", [FeatureSSE42, Feature64Bit, FeatureSlowBTMem]>; +def : Proc<"corei7", [FeatureSSE42, Feature64Bit, FeatureSlowBTMem, + FeatureFastUAMem]>; +def : Proc<"nehalem", [FeatureSSE42, Feature64Bit, FeatureSlowBTMem, + 
FeatureFastUAMem]>; // Sandy Bridge does not have FMA def : Proc<"sandybridge", [FeatureSSE42, FeatureAVX, Feature64Bit]>; @@ -160,10 +165,11 @@ def X86InstrInfo : InstrInfo { "hasAdSizePrefix", "Prefix", "hasREX_WPrefix", - "ImmTypeBits", - "FPFormBits", + "ImmT.Value", + "FPForm.Value", "hasLockPrefix", "SegOvrBits", + "ExeDomain.Value", "Opcode"]; let TSFlagsShifts = [0, 6, @@ -174,6 +180,7 @@ def X86InstrInfo : InstrInfo { 16, 19, 20, + 22, 24]; } diff --git a/lib/Target/X86/X86AsmBackend.cpp b/lib/Target/X86/X86AsmBackend.cpp index 754a20097908..8e2928c3b713 100644 --- a/lib/Target/X86/X86AsmBackend.cpp +++ b/lib/Target/X86/X86AsmBackend.cpp @@ -10,10 +10,14 @@ #include "llvm/Target/TargetAsmBackend.h" #include "X86.h" #include "X86FixupKinds.h" +#include "llvm/ADT/Twine.h" #include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSectionMachO.h" #include "llvm/MC/MachObjectWriter.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetRegistry.h" #include "llvm/Target/TargetAsmBackend.h" using namespace llvm; @@ -48,8 +52,135 @@ class X86AsmBackend : public TargetAsmBackend { for (unsigned i = 0; i != Size; ++i) DF.getContents()[Fixup.Offset + i] = uint8_t(Value >> (i * 8)); } + + bool MayNeedRelaxation(const MCInst &Inst, + const SmallVectorImpl &Fixups) const; + + void RelaxInstruction(const MCInstFragment *IF, MCInst &Res) const; + + bool WriteNopData(uint64_t Count, MCObjectWriter *OW) const; }; +static unsigned getRelaxedOpcode(unsigned Op) { + switch (Op) { + default: + return Op; + + case X86::JAE_1: return X86::JAE_4; + case X86::JA_1: return X86::JA_4; + case X86::JBE_1: return X86::JBE_4; + case X86::JB_1: return X86::JB_4; + case X86::JE_1: return X86::JE_4; + case X86::JGE_1: return X86::JGE_4; + case X86::JG_1: return X86::JG_4; + case X86::JLE_1: return X86::JLE_4; + case X86::JL_1: return X86::JL_4; + case X86::JMP_1: return 
X86::JMP_4; + case X86::JNE_1: return X86::JNE_4; + case X86::JNO_1: return X86::JNO_4; + case X86::JNP_1: return X86::JNP_4; + case X86::JNS_1: return X86::JNS_4; + case X86::JO_1: return X86::JO_4; + case X86::JP_1: return X86::JP_4; + case X86::JS_1: return X86::JS_4; + } +} + +bool X86AsmBackend::MayNeedRelaxation(const MCInst &Inst, + const SmallVectorImpl &Fixups) const { + // Check for a 1byte pcrel fixup, and enforce that we would know how to relax + // this instruction. + for (unsigned i = 0, e = Fixups.size(); i != e; ++i) { + if (unsigned(Fixups[i].Kind) == X86::reloc_pcrel_1byte) { + assert(getRelaxedOpcode(Inst.getOpcode()) != Inst.getOpcode()); + return true; + } + } + + return false; +} + +// FIXME: Can tblgen help at all here to verify there aren't other instructions +// we can relax? +void X86AsmBackend::RelaxInstruction(const MCInstFragment *IF, + MCInst &Res) const { + // The only relaxations X86 does is from a 1byte pcrel to a 4byte pcrel. + unsigned RelaxedOp = getRelaxedOpcode(IF->getInst().getOpcode()); + + if (RelaxedOp == IF->getInst().getOpcode()) { + SmallString<256> Tmp; + raw_svector_ostream OS(Tmp); + IF->getInst().dump_pretty(OS); + llvm_report_error("unexpected instruction to relax: " + OS.str()); + } + + Res = IF->getInst(); + Res.setOpcode(RelaxedOp); +} + +/// WriteNopData - Write optimal nops to the output file for the \arg Count +/// bytes. This returns the number of bytes written. It may return 0 if +/// the \arg Count is more than the maximum optimal nops. +/// +/// FIXME this is X86 32-bit specific and should move to a better place. 
+bool X86AsmBackend::WriteNopData(uint64_t Count, MCObjectWriter *OW) const { + static const uint8_t Nops[16][16] = { + // nop + {0x90}, + // xchg %ax,%ax + {0x66, 0x90}, + // nopl (%[re]ax) + {0x0f, 0x1f, 0x00}, + // nopl 0(%[re]ax) + {0x0f, 0x1f, 0x40, 0x00}, + // nopl 0(%[re]ax,%[re]ax,1) + {0x0f, 0x1f, 0x44, 0x00, 0x00}, + // nopw 0(%[re]ax,%[re]ax,1) + {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00}, + // nopl 0L(%[re]ax) + {0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00}, + // nopl 0L(%[re]ax,%[re]ax,1) + {0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}, + // nopw 0L(%[re]ax,%[re]ax,1) + {0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}, + // nopw %cs:0L(%[re]ax,%[re]ax,1) + {0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}, + // nopl 0(%[re]ax,%[re]ax,1) + // nopw 0(%[re]ax,%[re]ax,1) + {0x0f, 0x1f, 0x44, 0x00, 0x00, + 0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00}, + // nopw 0(%[re]ax,%[re]ax,1) + // nopw 0(%[re]ax,%[re]ax,1) + {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00, + 0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00}, + // nopw 0(%[re]ax,%[re]ax,1) + // nopl 0L(%[re]ax) */ + {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00, + 0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00}, + // nopl 0L(%[re]ax) + // nopl 0L(%[re]ax) + {0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00, + 0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00}, + // nopl 0L(%[re]ax) + // nopl 0L(%[re]ax,%[re]ax,1) + {0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00, + 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00} + }; + + // Write an optimal sequence for the first 15 bytes. + uint64_t OptimalCount = (Count < 16) ? Count : 15; + for (uint64_t i = 0, e = OptimalCount; i != e; i++) + OW->Write8(Nops[OptimalCount - 1][i]); + + // Finish with single byte nops. 
+ for (uint64_t i = OptimalCount, e = Count; i != e; ++i) + OW->Write8(0x90); + + return true; +} + +/* *** */ + class ELFX86AsmBackend : public X86AsmBackend { public: ELFX86AsmBackend(const Target &T) diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 5d3edbb9f9aa..c69eeb3e9006 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -383,7 +383,7 @@ bool X86FastISel::X86SelectAddress(Value *V, X86AddressMode &AM) { if (ConstantInt *CI = dyn_cast(U->getOperand(1))) { uint64_t Disp = (int32_t)AM.Disp + (uint64_t)CI->getSExtValue(); // They have to fit in the 32-bit signed displacement field though. - if (isInt32(Disp)) { + if (isInt<32>(Disp)) { AM.Disp = (uint32_t)Disp; return X86SelectAddress(U->getOperand(0), AM); } @@ -427,7 +427,7 @@ bool X86FastISel::X86SelectAddress(Value *V, X86AddressMode &AM) { } } // Check for displacement overflow. - if (!isInt32(Disp)) + if (!isInt<32>(Disp)) break; // Ok, the GEP indices were covered by constant-offset and scaled-index // addressing. Update the address state and move on to examining the base. diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 704f9c65a59e..b24d5a1707a0 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -802,6 +802,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) if (!VT.is128BitVector()) { continue; } + setOperationAction(ISD::AND, SVT, Promote); AddPromotedToType (ISD::AND, SVT, MVT::v2i64); setOperationAction(ISD::OR, SVT, Promote); @@ -1008,7 +1009,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // FIXME: These should be based on subtarget info. Plus, the values should // be smaller when we are in optimizing for size mode. 
maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores - maxStoresPerMemcpy = 16; // For @llvm.memcpy -> sequence of stores + maxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores maxStoresPerMemmove = 3; // For @llvm.memmove -> sequence of stores setPrefLoopAlignment(16); benefitFromCodePlacementOpt = true; @@ -1066,23 +1067,37 @@ unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const { } /// getOptimalMemOpType - Returns the target specific optimal type for load -/// and store operations as a result of memset, memcpy, and memmove -/// lowering. It returns MVT::iAny if SelectionDAG should be responsible for -/// determining it. +/// and store operations as a result of memset, memcpy, and memmove lowering. +/// If DstAlign is zero that means it's safe to destination alignment can +/// satisfy any constraint. Similarly if SrcAlign is zero it means there +/// isn't a need to check it against alignment requirement, probably because +/// the source does not need to be loaded. It returns EVT::Other if +/// SelectionDAG should be responsible for determining it. EVT -X86TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned Align, - bool isSrcConst, bool isSrcStr, +X86TargetLowering::getOptimalMemOpType(uint64_t Size, + unsigned DstAlign, unsigned SrcAlign, + bool SafeToUseFP, SelectionDAG &DAG) const { // FIXME: This turns off use of xmm stores for memset/memcpy on targets like // linux. This is because the stack realignment code can't handle certain // cases like PR2962. This should be removed when PR2962 is fixed. 
const Function *F = DAG.getMachineFunction().getFunction(); - bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); - if (!NoImplicitFloatOps && Subtarget->getStackAlignment() >= 16) { - if ((isSrcConst || isSrcStr) && Subtarget->hasSSE2() && Size >= 16) - return MVT::v4i32; - if ((isSrcConst || isSrcStr) && Subtarget->hasSSE1() && Size >= 16) - return MVT::v4f32; + if (!F->hasFnAttr(Attribute::NoImplicitFloat)) { + if (Size >= 16 && + (Subtarget->isUnalignedMemAccessFast() || + ((DstAlign == 0 || DstAlign >= 16) && + (SrcAlign == 0 || SrcAlign >= 16))) && + Subtarget->getStackAlignment() >= 16) { + if (Subtarget->hasSSE2()) + return MVT::v4i32; + if (SafeToUseFP && Subtarget->hasSSE1()) + return MVT::v4f32; + } else if (SafeToUseFP && + Size >= 8 && + !Subtarget->is64Bit() && + Subtarget->getStackAlignment() >= 8 && + Subtarget->hasSSE2()) + return MVT::f64; } if (Subtarget->is64Bit() && Size >= 8) return MVT::i64; @@ -1108,8 +1123,8 @@ MCSymbol * X86TargetLowering::getPICBaseSymbol(const MachineFunction *MF, MCContext &Ctx) const { const MCAsmInfo &MAI = *getTargetMachine().getMCAsmInfo(); - return Ctx.GetOrCreateTemporarySymbol(Twine(MAI.getPrivateGlobalPrefix())+ - Twine(MF->getFunctionNumber())+"$pb"); + return Ctx.GetOrCreateSymbol(Twine(MAI.getPrivateGlobalPrefix())+ + Twine(MF->getFunctionNumber())+"$pb"); } @@ -2290,6 +2305,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, return false; // If -tailcallopt is specified, make fastcc functions tail-callable. + const MachineFunction &MF = DAG.getMachineFunction(); const Function *CallerF = DAG.getMachineFunction().getFunction(); if (GuaranteedTailCallOpt) { if (IsTailCallConvention(CalleeCC) && @@ -2301,8 +2317,14 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // Look for obvious safe cases to perform tail call optimization that does not // requite ABI changes. This is what gcc calls sibcall. - // Do not sibcall optimize vararg calls for now. 
- if (isVarArg) + // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to + // emit a special epilogue. + if (RegInfo->needsStackRealignment(MF)) + return false; + + // Do not sibcall optimize vararg calls unless the call site is not passing any + // arguments. + if (isVarArg && !Outs.empty()) return false; // Also avoid sibcall optimization if either caller or callee uses struct @@ -2417,7 +2439,7 @@ SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) { bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, bool hasSymbolicDisplacement) { // Offset should fit into 32 bit immediate field. - if (!isInt32(Offset)) + if (!isInt<32>(Offset)) return false; // If we don't have a symbolic displacement - we don't have any extra @@ -3613,6 +3635,69 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, return SDValue(); } +/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a +/// vector of type 'VT', see if the elements can be replaced by a single large +/// load which has the same value as a build_vector whose operands are 'elts'. +/// +/// Example: -> zextload a +/// +/// FIXME: we'd also like to handle the case where the last elements are zero +/// rather than undef via VZEXT_LOAD, but we do not detect that case today. +/// There's even a handy isZeroNode for that purpose. +static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl &Elts, + DebugLoc &dl, SelectionDAG &DAG) { + EVT EltVT = VT.getVectorElementType(); + unsigned NumElems = Elts.size(); + + LoadSDNode *LDBase = NULL; + unsigned LastLoadedElt = -1U; + + // For each element in the initializer, see if we've found a load or an undef. + // If we don't find an initial load element, or later load elements are + // non-consecutive, bail out. 
+ for (unsigned i = 0; i < NumElems; ++i) { + SDValue Elt = Elts[i]; + + if (!Elt.getNode() || + (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) + return SDValue(); + if (!LDBase) { + if (Elt.getNode()->getOpcode() == ISD::UNDEF) + return SDValue(); + LDBase = cast(Elt.getNode()); + LastLoadedElt = i; + continue; + } + if (Elt.getOpcode() == ISD::UNDEF) + continue; + + LoadSDNode *LD = cast(Elt); + if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i)) + return SDValue(); + LastLoadedElt = i; + } + + // If we have found an entire vector of loads and undefs, then return a large + // load of the entire vector width starting at the base pointer. If we found + // consecutive loads for the low half, generate a vzext_load node. + if (LastLoadedElt == NumElems - 1) { + if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16) + return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(), + LDBase->getSrcValue(), LDBase->getSrcValueOffset(), + LDBase->isVolatile(), LDBase->isNonTemporal(), 0); + return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(), + LDBase->getSrcValue(), LDBase->getSrcValueOffset(), + LDBase->isVolatile(), LDBase->isNonTemporal(), + LDBase->getAlignment()); + } else if (NumElems == 4 && LastLoadedElt == 1) { + SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); + SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; + SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2); + return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode); + } + return SDValue(); +} + SDValue X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { DebugLoc dl = Op.getDebugLoc(); @@ -3841,14 +3926,18 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); } - if (Values.size() > 2) { - // If we have SSE 4.1, Expand into a number of inserts unless the number of - // values to be inserted is equal to the number of 
elements, in which case - // use the unpack code below in the hopes of matching the consecutive elts - // load merge pattern for shuffles. - // FIXME: We could probably just check that here directly. - if (Values.size() < NumElems && VT.getSizeInBits() == 128 && - getSubtarget()->hasSSE41()) { + if (Values.size() > 1 && VT.getSizeInBits() == 128) { + // Check for a build vector of consecutive loads. + for (unsigned i = 0; i < NumElems; ++i) + V[i] = Op.getOperand(i); + + // Check for elements which are consecutive loads. + SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG); + if (LD.getNode()) + return LD; + + // For SSE 4.1, use inserts into undef. + if (getSubtarget()->hasSSE41()) { V[0] = DAG.getUNDEF(VT); for (unsigned i = 0; i < NumElems; ++i) if (Op.getOperand(i).getOpcode() != ISD::UNDEF) @@ -3856,7 +3945,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { Op.getOperand(i), DAG.getIntPtrConstant(i)); return V[0]; } - // Expand into a number of unpckl*. + + // Otherwise, expand into a number of unpckl* // e.g. 
for v4f32 // Step 1: unpcklps 0, 2 ==> X: // : unpcklps 1, 3 ==> Y: @@ -3871,7 +3961,6 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { } return V[0]; } - return SDValue(); } @@ -8797,83 +8886,24 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N, return TargetLowering::isGAPlusOffset(N, GA, Offset); } -static bool EltsFromConsecutiveLoads(ShuffleVectorSDNode *N, unsigned NumElems, - EVT EltVT, LoadSDNode *&LDBase, - unsigned &LastLoadedElt, - SelectionDAG &DAG, MachineFrameInfo *MFI, - const TargetLowering &TLI) { - LDBase = NULL; - LastLoadedElt = -1U; - for (unsigned i = 0; i < NumElems; ++i) { - if (N->getMaskElt(i) < 0) { - if (!LDBase) - return false; - continue; - } - - SDValue Elt = DAG.getShuffleScalarElt(N, i); - if (!Elt.getNode() || - (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) - return false; - if (!LDBase) { - if (Elt.getNode()->getOpcode() == ISD::UNDEF) - return false; - LDBase = cast(Elt.getNode()); - LastLoadedElt = i; - continue; - } - if (Elt.getOpcode() == ISD::UNDEF) - continue; - - LoadSDNode *LD = cast(Elt); - if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i)) - return false; - LastLoadedElt = i; - } - return true; -} - /// PerformShuffleCombine - Combine a vector_shuffle that is equal to /// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load /// if the load addresses are consecutive, non-overlapping, and in the right -/// order. In the case of v2i64, it will see if it can rewrite the -/// shuffle to be an appropriate build vector so it can take advantage of -// performBuildVectorCombine. +/// order. 
static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, const TargetLowering &TLI) { DebugLoc dl = N->getDebugLoc(); EVT VT = N->getValueType(0); - EVT EltVT = VT.getVectorElementType(); ShuffleVectorSDNode *SVN = cast(N); - unsigned NumElems = VT.getVectorNumElements(); if (VT.getSizeInBits() != 128) return SDValue(); - // Try to combine a vector_shuffle into a 128-bit load. - MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); - LoadSDNode *LD = NULL; - unsigned LastLoadedElt; - if (!EltsFromConsecutiveLoads(SVN, NumElems, EltVT, LD, LastLoadedElt, DAG, - MFI, TLI)) - return SDValue(); - - if (LastLoadedElt == NumElems - 1) { - if (DAG.InferPtrAlignment(LD->getBasePtr()) >= 16) - return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(), - LD->getSrcValue(), LD->getSrcValueOffset(), - LD->isVolatile(), LD->isNonTemporal(), 0); - return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(), - LD->getSrcValue(), LD->getSrcValueOffset(), - LD->isVolatile(), LD->isNonTemporal(), - LD->getAlignment()); - } else if (NumElems == 4 && LastLoadedElt == 1) { - SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); - SDValue Ops[] = { LD->getChain(), LD->getBasePtr() }; - SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2); - return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode); - } - return SDValue(); + SmallVector Elts; + for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) + Elts.push_back(DAG.getShuffleScalarElt(SVN, i)); + + return EltsFromConsecutiveLoads(VT, Elts, dl, DAG); } /// PerformShuffleCombine - Detect vector gather/scatter index generation diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 0f15ebafb0e5..4549cba6f397 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -417,12 +417,15 @@ namespace llvm { virtual unsigned getByValTypeAlignment(const Type *Ty) const; /// getOptimalMemOpType - Returns the target specific optimal type for 
load - /// and store operations as a result of memset, memcpy, and memmove - /// lowering. It returns EVT::iAny if SelectionDAG should be responsible for - /// determining it. - virtual EVT getOptimalMemOpType(uint64_t Size, unsigned Align, - bool isSrcConst, bool isSrcStr, - SelectionDAG &DAG) const; + /// and store operations as a result of memset, memcpy, and memmove lowering. + /// If DstAlign is zero, it means the destination alignment can + /// satisfy any constraint. Similarly, if SrcAlign is zero it means there + /// isn't a need to check it against the alignment requirement, probably because + /// the source does not need to be loaded. It returns EVT::Other if + /// SelectionDAG should be responsible for determining it. + virtual EVT getOptimalMemOpType(uint64_t Size, + unsigned DstAlign, unsigned SrcAlign, + bool SafeToUseFP, SelectionDAG &DAG) const; /// allowsUnalignedMemoryAccesses - Returns true if the target allows /// unaligned memory accesses. of the specified type. 
diff --git a/lib/Target/X86/X86Instr64bit.td b/lib/Target/X86/X86Instr64bit.td index 8cbb756441e6..eef2ca078967 100644 --- a/lib/Target/X86/X86Instr64bit.td +++ b/lib/Target/X86/X86Instr64bit.td @@ -295,19 +295,17 @@ def BSWAP64r : RI<0xC8, AddRegFrm, (outs GR64:$dst), (ins GR64:$src), let Defs = [EFLAGS] in { def BSF64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), "bsf{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (X86bsf GR64:$src)), (implicit EFLAGS)]>, TB; + [(set GR64:$dst, EFLAGS, (X86bsf GR64:$src))]>, TB; def BSF64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "bsf{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (X86bsf (loadi64 addr:$src))), - (implicit EFLAGS)]>, TB; + [(set GR64:$dst, EFLAGS, (X86bsf (loadi64 addr:$src)))]>, TB; def BSR64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), "bsr{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (X86bsr GR64:$src)), (implicit EFLAGS)]>, TB; + [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))]>, TB; def BSR64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "bsr{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (X86bsr (loadi64 addr:$src))), - (implicit EFLAGS)]>, TB; + [(set GR64:$dst, EFLAGS, (X86bsr (loadi64 addr:$src)))]>, TB; } // Defs = [EFLAGS] // Repeat string ops @@ -508,8 +506,8 @@ let isCommutable = 1 in def ADD64rr : RI<0x01, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), "add{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (add GR64:$src1, GR64:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86add_flag GR64:$src1, GR64:$src2))]>; // These are alternate spellings for use by the disassembler, we mark them as // code gen only to ensure they aren't matched by the assembler. 
@@ -523,21 +521,21 @@ let isCodeGenOnly = 1 in { def ADD64ri8 : RIi8<0x83, MRM0r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), "add{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (add GR64:$src1, i64immSExt8:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86add_flag GR64:$src1, i64immSExt8:$src2))]>; def ADD64ri32 : RIi32<0x81, MRM0r, (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2), "add{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (add GR64:$src1, i64immSExt32:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86add_flag GR64:$src1, i64immSExt32:$src2))]>; } // isConvertibleToThreeAddress // Register-Memory Addition def ADD64rm : RI<0x03, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2), "add{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (add GR64:$src1, (load addr:$src2))), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86add_flag GR64:$src1, (load addr:$src2)))]>; } // isTwoAddress @@ -604,8 +602,8 @@ let isTwoAddress = 1 in { def SUB64rr : RI<0x29, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), "sub{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (sub GR64:$src1, GR64:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86sub_flag GR64:$src1, GR64:$src2))]>; def SUB64rr_REV : RI<0x2B, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), @@ -615,20 +613,20 @@ def SUB64rr_REV : RI<0x2B, MRMSrcReg, (outs GR64:$dst), def SUB64rm : RI<0x2B, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2), "sub{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (sub GR64:$src1, (load addr:$src2))), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86sub_flag GR64:$src1, (load addr:$src2)))]>; // Register-Integer Subtraction def SUB64ri8 : RIi8<0x83, MRM5r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), "sub{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (sub GR64:$src1, i64immSExt8:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86sub_flag 
GR64:$src1, i64immSExt8:$src2))]>; def SUB64ri32 : RIi32<0x81, MRM5r, (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2), "sub{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (sub GR64:$src1, i64immSExt32:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86sub_flag GR64:$src1, i64immSExt32:$src2))]>; } // isTwoAddress def SUB64i32 : RIi32<0x2D, RawFrm, (outs), (ins i32imm:$src), @@ -716,15 +714,15 @@ let isCommutable = 1 in def IMUL64rr : RI<0xAF, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), "imul{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (mul GR64:$src1, GR64:$src2)), - (implicit EFLAGS)]>, TB; + [(set GR64:$dst, EFLAGS, + (X86smul_flag GR64:$src1, GR64:$src2))]>, TB; // Register-Memory Signed Integer Multiplication def IMUL64rm : RI<0xAF, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2), "imul{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (mul GR64:$src1, (load addr:$src2))), - (implicit EFLAGS)]>, TB; + [(set GR64:$dst, EFLAGS, + (X86smul_flag GR64:$src1, (load addr:$src2)))]>, TB; } // isTwoAddress // Suprisingly enough, these are not two address instructions! 
@@ -733,27 +731,27 @@ def IMUL64rm : RI<0xAF, MRMSrcMem, (outs GR64:$dst), def IMUL64rri8 : RIi8<0x6B, MRMSrcReg, // GR64 = GR64*I8 (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR64:$dst, (mul GR64:$src1, i64immSExt8:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86smul_flag GR64:$src1, i64immSExt8:$src2))]>; def IMUL64rri32 : RIi32<0x69, MRMSrcReg, // GR64 = GR64*I32 (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2), "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR64:$dst, (mul GR64:$src1, i64immSExt32:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86smul_flag GR64:$src1, i64immSExt32:$src2))]>; // Memory-Integer Signed Integer Multiplication def IMUL64rmi8 : RIi8<0x6B, MRMSrcMem, // GR64 = [mem64]*I8 (outs GR64:$dst), (ins i64mem:$src1, i64i8imm: $src2), "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR64:$dst, (mul (load addr:$src1), - i64immSExt8:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86smul_flag (load addr:$src1), + i64immSExt8:$src2))]>; def IMUL64rmi32 : RIi32<0x69, MRMSrcMem, // GR64 = [mem64]*I32 (outs GR64:$dst), (ins i64mem:$src1, i64i32imm:$src2), "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR64:$dst, (mul (load addr:$src1), - i64immSExt32:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86smul_flag (load addr:$src1), + i64immSExt32:$src2))]>; } // Defs = [EFLAGS] // Unsigned division / remainder @@ -787,16 +785,14 @@ def NEG64m : RI<0xF7, MRM3m, (outs), (ins i64mem:$dst), "neg{q}\t$dst", let isTwoAddress = 1, isConvertibleToThreeAddress = 1 in def INC64r : RI<0xFF, MRM0r, (outs GR64:$dst), (ins GR64:$src), "inc{q}\t$dst", - [(set GR64:$dst, (add GR64:$src, 1)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, (X86inc_flag GR64:$src))]>; def INC64m : RI<0xFF, MRM0m, (outs), (ins i64mem:$dst), "inc{q}\t$dst", [(store (add (loadi64 addr:$dst), 1), addr:$dst), (implicit 
EFLAGS)]>; let isTwoAddress = 1, isConvertibleToThreeAddress = 1 in def DEC64r : RI<0xFF, MRM1r, (outs GR64:$dst), (ins GR64:$src), "dec{q}\t$dst", - [(set GR64:$dst, (add GR64:$src, -1)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, (X86dec_flag GR64:$src))]>; def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", [(store (add (loadi64 addr:$dst), -1), addr:$dst), (implicit EFLAGS)]>; @@ -806,23 +802,19 @@ let isTwoAddress = 1, isConvertibleToThreeAddress = 1 in { // Can transform into LEA. def INC64_16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src), "inc{w}\t$dst", - [(set GR16:$dst, (add GR16:$src, 1)), - (implicit EFLAGS)]>, + [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src))]>, OpSize, Requires<[In64BitMode]>; def INC64_32r : I<0xFF, MRM0r, (outs GR32:$dst), (ins GR32:$src), "inc{l}\t$dst", - [(set GR32:$dst, (add GR32:$src, 1)), - (implicit EFLAGS)]>, + [(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src))]>, Requires<[In64BitMode]>; def DEC64_16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src), "dec{w}\t$dst", - [(set GR16:$dst, (add GR16:$src, -1)), - (implicit EFLAGS)]>, + [(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src))]>, OpSize, Requires<[In64BitMode]>; def DEC64_32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src), "dec{l}\t$dst", - [(set GR32:$dst, (add GR32:$src, -1)), - (implicit EFLAGS)]>, + [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src))]>, Requires<[In64BitMode]>; } // isConvertibleToThreeAddress @@ -1092,26 +1084,26 @@ let isCommutable = 1 in def AND64rr : RI<0x21, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), "and{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (and GR64:$src1, GR64:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86and_flag GR64:$src1, GR64:$src2))]>; def AND64rr_REV : RI<0x23, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), "and{q}\t{$src2, $dst|$dst, $src2}", []>; def AND64rm : RI<0x23, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2), 
"and{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (and GR64:$src1, (load addr:$src2))), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86and_flag GR64:$src1, (load addr:$src2)))]>; def AND64ri8 : RIi8<0x83, MRM4r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), "and{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (and GR64:$src1, i64immSExt8:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86and_flag GR64:$src1, i64immSExt8:$src2))]>; def AND64ri32 : RIi32<0x81, MRM4r, (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2), "and{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (and GR64:$src1, i64immSExt32:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86and_flag GR64:$src1, i64immSExt32:$src2))]>; } // isTwoAddress def AND64mr : RI<0x21, MRMDestMem, @@ -1135,26 +1127,26 @@ let isCommutable = 1 in def OR64rr : RI<0x09, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), "or{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (or GR64:$src1, GR64:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86or_flag GR64:$src1, GR64:$src2))]>; def OR64rr_REV : RI<0x0B, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), "or{q}\t{$src2, $dst|$dst, $src2}", []>; def OR64rm : RI<0x0B, MRMSrcMem , (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2), "or{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (or GR64:$src1, (load addr:$src2))), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86or_flag GR64:$src1, (load addr:$src2)))]>; def OR64ri8 : RIi8<0x83, MRM1r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), "or{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (or GR64:$src1, i64immSExt8:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86or_flag GR64:$src1, i64immSExt8:$src2))]>; def OR64ri32 : RIi32<0x81, MRM1r, (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2), "or{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (or GR64:$src1, i64immSExt32:$src2)), - (implicit EFLAGS)]>; + 
[(set GR64:$dst, EFLAGS, + (X86or_flag GR64:$src1, i64immSExt32:$src2))]>; } // isTwoAddress def OR64mr : RI<0x09, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), @@ -1178,26 +1170,26 @@ let isCommutable = 1 in def XOR64rr : RI<0x31, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), "xor{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (xor GR64:$src1, GR64:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86xor_flag GR64:$src1, GR64:$src2))]>; def XOR64rr_REV : RI<0x33, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), "xor{q}\t{$src2, $dst|$dst, $src2}", []>; def XOR64rm : RI<0x33, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2), "xor{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (xor GR64:$src1, (load addr:$src2))), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86xor_flag GR64:$src1, (load addr:$src2)))]>; def XOR64ri8 : RIi8<0x83, MRM6r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), "xor{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (xor GR64:$src1, i64immSExt8:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86xor_flag GR64:$src1, i64immSExt8:$src2))]>; def XOR64ri32 : RIi32<0x81, MRM6r, (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2), "xor{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (xor GR64:$src1, i64immSExt32:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86xor_flag GR64:$src1, i64immSExt32:$src2))]>; } // isTwoAddress def XOR64mr : RI<0x31, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), @@ -2181,14 +2173,11 @@ def : Pat<(store (shld (loadi64 addr:$dst), (i8 imm:$amt1), // (or x1, x2) -> (add x1, x2) if two operands are known not to share bits. 
let AddedComplexity = 5 in { // Try this before the selecting to OR -def : Pat<(parallel (or_is_add GR64:$src1, i64immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(or_is_add GR64:$src1, i64immSExt8:$src2), (ADD64ri8 GR64:$src1, i64immSExt8:$src2)>; -def : Pat<(parallel (or_is_add GR64:$src1, i64immSExt32:$src2), - (implicit EFLAGS)), +def : Pat<(or_is_add GR64:$src1, i64immSExt32:$src2), (ADD64ri32 GR64:$src1, i64immSExt32:$src2)>; -def : Pat<(parallel (or_is_add GR64:$src1, GR64:$src2), - (implicit EFLAGS)), +def : Pat<(or_is_add GR64:$src1, GR64:$src2), (ADD64rr GR64:$src1, GR64:$src2)>; } // AddedComplexity @@ -2215,136 +2204,76 @@ def : Pat<(subc GR64:$src1, imm:$src2), // EFLAGS-defining Patterns //===----------------------------------------------------------------------===// -// Register-Register Addition with EFLAGS result -def : Pat<(parallel (X86add_flag GR64:$src1, GR64:$src2), - (implicit EFLAGS)), +// addition +def : Pat<(add GR64:$src1, GR64:$src2), (ADD64rr GR64:$src1, GR64:$src2)>; - -// Register-Integer Addition with EFLAGS result -def : Pat<(parallel (X86add_flag GR64:$src1, i64immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(add GR64:$src1, i64immSExt8:$src2), (ADD64ri8 GR64:$src1, i64immSExt8:$src2)>; -def : Pat<(parallel (X86add_flag GR64:$src1, i64immSExt32:$src2), - (implicit EFLAGS)), +def : Pat<(add GR64:$src1, i64immSExt32:$src2), (ADD64ri32 GR64:$src1, i64immSExt32:$src2)>; - -// Register-Memory Addition with EFLAGS result -def : Pat<(parallel (X86add_flag GR64:$src1, (loadi64 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(add GR64:$src1, (loadi64 addr:$src2)), (ADD64rm GR64:$src1, addr:$src2)>; -// Register-Register Subtraction with EFLAGS result -def : Pat<(parallel (X86sub_flag GR64:$src1, GR64:$src2), - (implicit EFLAGS)), +// subtraction +def : Pat<(sub GR64:$src1, GR64:$src2), (SUB64rr GR64:$src1, GR64:$src2)>; - -// Register-Memory Subtraction with EFLAGS result -def : Pat<(parallel (X86sub_flag GR64:$src1, (loadi64 
addr:$src2)), - (implicit EFLAGS)), +def : Pat<(sub GR64:$src1, (loadi64 addr:$src2)), (SUB64rm GR64:$src1, addr:$src2)>; - -// Register-Integer Subtraction with EFLAGS result -def : Pat<(parallel (X86sub_flag GR64:$src1, i64immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(sub GR64:$src1, i64immSExt8:$src2), (SUB64ri8 GR64:$src1, i64immSExt8:$src2)>; -def : Pat<(parallel (X86sub_flag GR64:$src1, i64immSExt32:$src2), - (implicit EFLAGS)), +def : Pat<(sub GR64:$src1, i64immSExt32:$src2), (SUB64ri32 GR64:$src1, i64immSExt32:$src2)>; -// Register-Register Signed Integer Multiplication with EFLAGS result -def : Pat<(parallel (X86smul_flag GR64:$src1, GR64:$src2), - (implicit EFLAGS)), +// Multiply +def : Pat<(mul GR64:$src1, GR64:$src2), (IMUL64rr GR64:$src1, GR64:$src2)>; - -// Register-Memory Signed Integer Multiplication with EFLAGS result -def : Pat<(parallel (X86smul_flag GR64:$src1, (loadi64 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(mul GR64:$src1, (loadi64 addr:$src2)), (IMUL64rm GR64:$src1, addr:$src2)>; - -// Register-Integer Signed Integer Multiplication with EFLAGS result -def : Pat<(parallel (X86smul_flag GR64:$src1, i64immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(mul GR64:$src1, i64immSExt8:$src2), (IMUL64rri8 GR64:$src1, i64immSExt8:$src2)>; -def : Pat<(parallel (X86smul_flag GR64:$src1, i64immSExt32:$src2), - (implicit EFLAGS)), +def : Pat<(mul GR64:$src1, i64immSExt32:$src2), (IMUL64rri32 GR64:$src1, i64immSExt32:$src2)>; - -// Memory-Integer Signed Integer Multiplication with EFLAGS result -def : Pat<(parallel (X86smul_flag (loadi64 addr:$src1), i64immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(mul (loadi64 addr:$src1), i64immSExt8:$src2), (IMUL64rmi8 addr:$src1, i64immSExt8:$src2)>; -def : Pat<(parallel (X86smul_flag (loadi64 addr:$src1), i64immSExt32:$src2), - (implicit EFLAGS)), +def : Pat<(mul (loadi64 addr:$src1), i64immSExt32:$src2), (IMUL64rmi32 addr:$src1, i64immSExt32:$src2)>; -// INC and DEC with EFLAGS result. 
Note that these do not set CF. -def : Pat<(parallel (X86inc_flag GR16:$src), (implicit EFLAGS)), - (INC64_16r GR16:$src)>, Requires<[In64BitMode]>; -def : Pat<(parallel (X86dec_flag GR16:$src), (implicit EFLAGS)), - (DEC64_16r GR16:$src)>, Requires<[In64BitMode]>; +// inc/dec +def : Pat<(add GR16:$src, 1), (INC64_16r GR16:$src)>, Requires<[In64BitMode]>; +def : Pat<(add GR16:$src, -1), (DEC64_16r GR16:$src)>, Requires<[In64BitMode]>; +def : Pat<(add GR32:$src, 1), (INC64_32r GR32:$src)>, Requires<[In64BitMode]>; +def : Pat<(add GR32:$src, -1), (DEC64_32r GR32:$src)>, Requires<[In64BitMode]>; +def : Pat<(add GR64:$src, 1), (INC64r GR64:$src)>; +def : Pat<(add GR64:$src, -1), (DEC64r GR64:$src)>; -def : Pat<(parallel (X86inc_flag GR32:$src), (implicit EFLAGS)), - (INC64_32r GR32:$src)>, Requires<[In64BitMode]>; -def : Pat<(parallel (X86dec_flag GR32:$src), (implicit EFLAGS)), - (DEC64_32r GR32:$src)>, Requires<[In64BitMode]>; - -def : Pat<(parallel (X86inc_flag GR64:$src), (implicit EFLAGS)), - (INC64r GR64:$src)>; -def : Pat<(parallel (X86dec_flag GR64:$src), (implicit EFLAGS)), - (DEC64r GR64:$src)>; - -// Register-Register Logical Or with EFLAGS result -def : Pat<(parallel (X86or_flag GR64:$src1, GR64:$src2), - (implicit EFLAGS)), +// or +def : Pat<(or GR64:$src1, GR64:$src2), (OR64rr GR64:$src1, GR64:$src2)>; - -// Register-Integer Logical Or with EFLAGS result -def : Pat<(parallel (X86or_flag GR64:$src1, i64immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(or GR64:$src1, i64immSExt8:$src2), (OR64ri8 GR64:$src1, i64immSExt8:$src2)>; -def : Pat<(parallel (X86or_flag GR64:$src1, i64immSExt32:$src2), - (implicit EFLAGS)), +def : Pat<(or GR64:$src1, i64immSExt32:$src2), (OR64ri32 GR64:$src1, i64immSExt32:$src2)>; - -// Register-Memory Logical Or with EFLAGS result -def : Pat<(parallel (X86or_flag GR64:$src1, (loadi64 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(or GR64:$src1, (loadi64 addr:$src2)), (OR64rm GR64:$src1, addr:$src2)>; -// Register-Register Logical 
XOr with EFLAGS result -def : Pat<(parallel (X86xor_flag GR64:$src1, GR64:$src2), - (implicit EFLAGS)), +// xor +def : Pat<(xor GR64:$src1, GR64:$src2), (XOR64rr GR64:$src1, GR64:$src2)>; - -// Register-Integer Logical XOr with EFLAGS result -def : Pat<(parallel (X86xor_flag GR64:$src1, i64immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(xor GR64:$src1, i64immSExt8:$src2), (XOR64ri8 GR64:$src1, i64immSExt8:$src2)>; -def : Pat<(parallel (X86xor_flag GR64:$src1, i64immSExt32:$src2), - (implicit EFLAGS)), +def : Pat<(xor GR64:$src1, i64immSExt32:$src2), (XOR64ri32 GR64:$src1, i64immSExt32:$src2)>; - -// Register-Memory Logical XOr with EFLAGS result -def : Pat<(parallel (X86xor_flag GR64:$src1, (loadi64 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(xor GR64:$src1, (loadi64 addr:$src2)), (XOR64rm GR64:$src1, addr:$src2)>; -// Register-Register Logical And with EFLAGS result -def : Pat<(parallel (X86and_flag GR64:$src1, GR64:$src2), - (implicit EFLAGS)), +// and +def : Pat<(and GR64:$src1, GR64:$src2), (AND64rr GR64:$src1, GR64:$src2)>; - -// Register-Integer Logical And with EFLAGS result -def : Pat<(parallel (X86and_flag GR64:$src1, i64immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(and GR64:$src1, i64immSExt8:$src2), (AND64ri8 GR64:$src1, i64immSExt8:$src2)>; -def : Pat<(parallel (X86and_flag GR64:$src1, i64immSExt32:$src2), - (implicit EFLAGS)), +def : Pat<(and GR64:$src1, i64immSExt32:$src2), (AND64ri32 GR64:$src1, i64immSExt32:$src2)>; - -// Register-Memory Logical And with EFLAGS result -def : Pat<(parallel (X86and_flag GR64:$src1, (loadi64 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(and GR64:$src1, (loadi64 addr:$src2)), (AND64rm GR64:$src1, addr:$src2)>; //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td index bb81cbf8ac18..d25ec260491e 100644 --- a/lib/Target/X86/X86InstrFormats.td +++ b/lib/Target/X86/X86InstrFormats.td @@ -68,6 +68,16 
@@ def CompareFP : FPFormat<5>; def CondMovFP : FPFormat<6>; def SpecialFP : FPFormat<7>; +// Class specifying the SSE execution domain, used by the SSEDomainFix pass. +// Keep in sync with tables in X86InstrInfo.cpp. +class Domain val> { + bits<2> Value = val; +} +def GenericDomain : Domain<0>; +def SSEPackedSingle : Domain<1>; +def SSEPackedDouble : Domain<2>; +def SSEPackedInt : Domain<3>; + // Prefix byte classes which are used to indicate to the ad-hoc machine code // emitter that various prefix bytes are required. class OpSize { bit hasOpSizePrefix = 1; } @@ -93,7 +103,7 @@ class TA { bits<4> Prefix = 14; } class TF { bits<4> Prefix = 15; } class X86Inst opcod, Format f, ImmType i, dag outs, dag ins, - string AsmStr> + string AsmStr, Domain d = GenericDomain> : Instruction { let Namespace = "X86"; @@ -101,7 +111,6 @@ class X86Inst opcod, Format f, ImmType i, dag outs, dag ins, Format Form = f; bits<6> FormBits = Form.Value; ImmType ImmT = i; - bits<3> ImmTypeBits = ImmT.Value; dag OutOperandList = outs; dag InOperandList = ins; @@ -115,20 +124,21 @@ class X86Inst opcod, Format f, ImmType i, dag outs, dag ins, bits<4> Prefix = 0; // Which prefix byte does this inst have? bit hasREX_WPrefix = 0; // Does this inst requires the REX.W prefix? - FPFormat FPForm; // What flavor of FP instruction is this? - bits<3> FPFormBits = 0; + FPFormat FPForm = NotFP; // What flavor of FP instruction is this? bit hasLockPrefix = 0; // Does this inst have a 0xF0 prefix? bits<2> SegOvrBits = 0; // Segment override prefix. 
+ Domain ExeDomain = d; } -class I o, Format f, dag outs, dag ins, string asm, list pattern> - : X86Inst { +class I o, Format f, dag outs, dag ins, string asm, + list pattern, Domain d = GenericDomain> + : X86Inst { let Pattern = pattern; let CodeSize = 3; } class Ii8 o, Format f, dag outs, dag ins, string asm, - list pattern> - : X86Inst { + list pattern, Domain d = GenericDomain> + : X86Inst { let Pattern = pattern; let CodeSize = 3; } @@ -166,7 +176,7 @@ class FPI o, Format F, dag outs, dag ins, string asm> // FpI_ - Floating Point Psuedo Instruction template. Not Predicated. class FpI_ pattern> : X86Inst<0, Pseudo, NoImm, outs, ins, ""> { - let FPForm = fp; let FPFormBits = FPForm.Value; + let FPForm = fp; let Pattern = pattern; } @@ -196,14 +206,16 @@ class Iseg32 o, Format f, dag outs, dag ins, string asm, class SSI o, Format F, dag outs, dag ins, string asm, list pattern> : I, XS, Requires<[HasSSE1]>; -class SSIi8 o, Format F, dag outs, dag ins, string asm, +class SSIi8 o, Format F, dag outs, dag ins, string asm, list pattern> : Ii8, XS, Requires<[HasSSE1]>; class PSI o, Format F, dag outs, dag ins, string asm, list pattern> - : I, TB, Requires<[HasSSE1]>; + : I, TB, + Requires<[HasSSE1]>; class PSIi8 o, Format F, dag outs, dag ins, string asm, list pattern> - : Ii8, TB, Requires<[HasSSE1]>; + : Ii8, TB, + Requires<[HasSSE1]>; // SSE2 Instruction Templates: // @@ -222,10 +234,12 @@ class SSDIi8 o, Format F, dag outs, dag ins, string asm, list pattern> : Ii8, XS, Requires<[HasSSE2]>; class PDI o, Format F, dag outs, dag ins, string asm, list pattern> - : I, TB, OpSize, Requires<[HasSSE2]>; + : I, TB, OpSize, + Requires<[HasSSE2]>; class PDIi8 o, Format F, dag outs, dag ins, string asm, list pattern> - : Ii8, TB, OpSize, Requires<[HasSSE2]>; + : Ii8, TB, OpSize, + Requires<[HasSSE2]>; // SSE3 Instruction Templates: // @@ -235,12 +249,15 @@ class PDIi8 o, Format F, dag outs, dag ins, string asm, class S3SI o, Format F, dag outs, dag ins, string asm, list 
pattern> - : I, XS, Requires<[HasSSE3]>; + : I, XS, + Requires<[HasSSE3]>; class S3DI o, Format F, dag outs, dag ins, string asm, list pattern> - : I, XD, Requires<[HasSSE3]>; + : I, XD, + Requires<[HasSSE3]>; class S3I o, Format F, dag outs, dag ins, string asm, list pattern> - : I, TB, OpSize, Requires<[HasSSE3]>; + : I, TB, OpSize, + Requires<[HasSSE3]>; // SSSE3 Instruction Templates: @@ -254,10 +271,12 @@ class S3I o, Format F, dag outs, dag ins, string asm, list pattern> class SS38I o, Format F, dag outs, dag ins, string asm, list pattern> - : Ii8, T8, Requires<[HasSSSE3]>; + : Ii8, T8, + Requires<[HasSSSE3]>; class SS3AI o, Format F, dag outs, dag ins, string asm, list pattern> - : Ii8, TA, Requires<[HasSSSE3]>; + : Ii8, TA, + Requires<[HasSSSE3]>; // SSE4.1 Instruction Templates: // @@ -266,17 +285,20 @@ class SS3AI o, Format F, dag outs, dag ins, string asm, // class SS48I o, Format F, dag outs, dag ins, string asm, list pattern> - : I, T8, Requires<[HasSSE41]>; + : I, T8, + Requires<[HasSSE41]>; class SS4AIi8 o, Format F, dag outs, dag ins, string asm, list pattern> - : Ii8, TA, Requires<[HasSSE41]>; + : Ii8, TA, + Requires<[HasSSE41]>; // SSE4.2 Instruction Templates: // // SS428I - SSE 4.2 instructions with T8 prefix. class SS428I o, Format F, dag outs, dag ins, string asm, list pattern> - : I, T8, Requires<[HasSSE42]>; + : I, T8, + Requires<[HasSSE42]>; // SS42FI - SSE 4.2 instructions with TF prefix. class SS42FI o, Format F, dag outs, dag ins, string asm, @@ -286,7 +308,8 @@ class SS42FI o, Format F, dag outs, dag ins, string asm, // SS42AI = SSE 4.2 instructions with TA prefix class SS42AI o, Format F, dag outs, dag ins, string asm, list pattern> - : I, TA, Requires<[HasSSE42]>; + : Ii8, TA, + Requires<[HasSSE42]>; // X86-64 Instruction templates... 
// diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 139a905ec38d..c0c9d98ffea8 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -597,7 +597,6 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::PMULHUWrr, X86::PMULHUWrm, 16 }, { X86::PMULHWrr, X86::PMULHWrm, 16 }, { X86::PMULLDrr, X86::PMULLDrm, 16 }, - { X86::PMULLDrr_int, X86::PMULLDrm_int, 16 }, { X86::PMULLWrr, X86::PMULLWrm, 16 }, { X86::PMULUDQrr, X86::PMULUDQrm, 16 }, { X86::PORrr, X86::PORrm, 16 }, @@ -992,8 +991,10 @@ X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI, /// a few instructions in each direction it assumes it's not safe. static bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) { + MachineBasicBlock::iterator E = MBB.end(); + // It's always safe to clobber EFLAGS at the end of a block. - if (I == MBB.end()) + if (I == E) return true; // For compile time consideration, if we are not able to determine the @@ -1017,20 +1018,28 @@ static bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB, // This instruction defines EFLAGS, no need to look any further. return true; ++Iter; + // Skip over DBG_VALUE. + while (Iter != E && Iter->isDebugValue()) + ++Iter; // If we make it to the end of the block, it's safe to clobber EFLAGS. - if (Iter == MBB.end()) + if (Iter == E) return true; } + MachineBasicBlock::iterator B = MBB.begin(); Iter = I; for (unsigned i = 0; i < 4; ++i) { // If we make it to the beginning of the block, it's safe to clobber // EFLAGS iff EFLAGS is not live-in. - if (Iter == MBB.begin()) + if (Iter == B) return !MBB.isLiveIn(X86::EFLAGS); --Iter; + // Skip over DBG_VALUE. 
+ while (Iter != B && Iter->isDebugValue()) + --Iter; + bool SawKill = false; for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) { MachineOperand &MO = Iter->getOperand(j); @@ -1677,6 +1686,8 @@ bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock::iterator I = MBB.end(); while (I != MBB.begin()) { --I; + if (I->isDebugValue()) + continue; // Working from the bottom, when we see a non-terminator instruction, we're // done. @@ -1773,6 +1784,8 @@ unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { while (I != MBB.begin()) { --I; + if (I->isDebugValue()) + continue; if (I->getOpcode() != X86::JMP_4 && GetCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID) break; @@ -2505,7 +2518,9 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, Alignment = (*LoadMI->memoperands_begin())->getAlignment(); else switch (LoadMI->getOpcode()) { - case X86::V_SET0: + case X86::V_SET0PS: + case X86::V_SET0PD: + case X86::V_SET0PI: case X86::V_SETALLONES: Alignment = 16; break; @@ -2535,11 +2550,13 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, SmallVector MOs; switch (LoadMI->getOpcode()) { - case X86::V_SET0: + case X86::V_SET0PS: + case X86::V_SET0PD: + case X86::V_SET0PI: case X86::V_SETALLONES: case X86::FsFLD0SD: case X86::FsFLD0SS: { - // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure. + // Folding a V_SET0P? or V_SETALLONES as a load, to ease register pressure. // Create a constant-pool entry and operands to load from it. // Medium and large mode can't fold loads this way. @@ -3648,3 +3665,51 @@ unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const { X86FI->setGlobalBaseReg(GlobalBaseReg); return GlobalBaseReg; } + +// These are the replaceable SSE instructions. Some of these have Int variants +// that we don't include here. We don't want to replace instructions selected +// by intrinsics. 
+static const unsigned ReplaceableInstrs[][3] = { + //PackedInt PackedSingle PackedDouble + { X86::MOVAPSmr, X86::MOVAPDmr, X86::MOVDQAmr }, + { X86::MOVAPSrm, X86::MOVAPDrm, X86::MOVDQArm }, + { X86::MOVAPSrr, X86::MOVAPDrr, X86::MOVDQArr }, + { X86::MOVUPSmr, X86::MOVUPDmr, X86::MOVDQUmr }, + { X86::MOVUPSrm, X86::MOVUPDrm, X86::MOVDQUrm }, + { X86::MOVNTPSmr, X86::MOVNTPDmr, X86::MOVNTDQmr }, + { X86::ANDNPSrm, X86::ANDNPDrm, X86::PANDNrm }, + { X86::ANDNPSrr, X86::ANDNPDrr, X86::PANDNrr }, + { X86::ANDPSrm, X86::ANDPDrm, X86::PANDrm }, + { X86::ANDPSrr, X86::ANDPDrr, X86::PANDrr }, + { X86::ORPSrm, X86::ORPDrm, X86::PORrm }, + { X86::ORPSrr, X86::ORPDrr, X86::PORrr }, + { X86::V_SET0PS, X86::V_SET0PD, X86::V_SET0PI }, + { X86::XORPSrm, X86::XORPDrm, X86::PXORrm }, + { X86::XORPSrr, X86::XORPDrr, X86::PXORrr }, +}; + +// FIXME: Some shuffle and unpack instructions have equivalents in different +// domains, but they require a bit more work than just switching opcodes. + +static const unsigned *lookup(unsigned opcode, unsigned domain) { + for (unsigned i = 0, e = array_lengthof(ReplaceableInstrs); i != e; ++i) + if (ReplaceableInstrs[i][domain-1] == opcode) + return ReplaceableInstrs[i]; + return 0; +} + +std::pair +X86InstrInfo::GetSSEDomain(const MachineInstr *MI) const { + uint16_t domain = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3; + return std::make_pair(domain, + domain && lookup(MI->getOpcode(), domain) ? 
0xe : 0); +} + +void X86InstrInfo::SetSSEDomain(MachineInstr *MI, unsigned Domain) const { + assert(Domain>0 && Domain<4 && "Invalid execution domain"); + uint16_t dom = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3; + assert(dom && "Not an SSE instruction"); + const unsigned *table = lookup(MI->getOpcode(), dom); + assert(table && "Cannot change domain"); + MI->setDesc(get(table[Domain-1])); +} diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index 5111719a2094..f0bdd06cce88 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -398,7 +398,10 @@ namespace X86II { FS = 1 << SegOvrShift, GS = 2 << SegOvrShift, - // Bits 22 -> 23 are unused + // Execution domain for SSE instructions in bits 22, 23. + // 0 in bits 22-23 means normal, non-SSE instruction. + SSEDomainShift = 22, + OpcodeShift = 24, OpcodeMask = 0xFF << OpcodeShift }; @@ -486,7 +489,7 @@ class X86InstrInfo : public TargetInstrInfoImpl { /// MemOp2RegOpTable - Load / store unfolding opcode map. /// DenseMap > MemOp2RegOpTable; - + public: explicit X86InstrInfo(X86TargetMachine &tm); @@ -716,6 +719,13 @@ class X86InstrInfo : public TargetInstrInfoImpl { /// unsigned getGlobalBaseReg(MachineFunction *MF) const; + /// GetSSEDomain - Return the SSE execution domain of MI as the first element, + /// and a bitmask of possible arguments to SetSSEDomain ase the second. + std::pair GetSSEDomain(const MachineInstr *MI) const; + + /// SetSSEDomain - Set the SSEDomain of MI. 
+ void SetSSEDomain(MachineInstr *MI, unsigned Domain) const; + private: MachineInstr * convertToThreeAddressWithLEA(unsigned MIOpc, MachineFunction::iterator &MFI, diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index c80a18dbf60b..8fccc8a37aca 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -1,4 +1,4 @@ - +//===----------------------------------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -28,12 +28,13 @@ def SDTX86Cmov : SDTypeProfile<1, 4, SDTCisVT<3, i8>, SDTCisVT<4, i32>]>; // Unary and binary operator instructions that set EFLAGS as a side-effect. -def SDTUnaryArithWithFlags : SDTypeProfile<1, 1, - [SDTCisInt<0>]>; -def SDTBinaryArithWithFlags : SDTypeProfile<1, 2, - [SDTCisSameAs<0, 1>, - SDTCisSameAs<0, 2>, - SDTCisInt<0>]>; +def SDTUnaryArithWithFlags : SDTypeProfile<2, 1, + [SDTCisInt<0>, SDTCisVT<1, i32>]>; + +def SDTBinaryArithWithFlags : SDTypeProfile<2, 2, + [SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, + SDTCisInt<0>, SDTCisVT<1, i32>]>; def SDTX86BrCond : SDTypeProfile<0, 3, [SDTCisVT<0, OtherVT>, SDTCisVT<1, i8>, SDTCisVT<2, i32>]>; @@ -77,8 +78,8 @@ def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>; def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>; -def X86bsf : SDNode<"X86ISD::BSF", SDTIntUnaryOp>; -def X86bsr : SDNode<"X86ISD::BSR", SDTIntUnaryOp>; +def X86bsf : SDNode<"X86ISD::BSF", SDTUnaryArithWithFlags>; +def X86bsr : SDNode<"X86ISD::BSR", SDTUnaryArithWithFlags>; def X86shld : SDNode<"X86ISD::SHLD", SDTIntShiftDOp>; def X86shrd : SDNode<"X86ISD::SHRD", SDTIntShiftDOp>; @@ -167,6 +168,7 @@ def X86smul_flag : SDNode<"X86ISD::SMUL", SDTBinaryArithWithFlags, [SDNPCommutative]>; def X86umul_flag : SDNode<"X86ISD::UMUL", SDTUnaryArithWithFlags, [SDNPCommutative]>; + def X86inc_flag : SDNode<"X86ISD::INC", SDTUnaryArithWithFlags>; def X86dec_flag : SDNode<"X86ISD::DEC", SDTUnaryArithWithFlags>; def X86or_flag : 
SDNode<"X86ISD::OR", SDTBinaryArithWithFlags, @@ -323,6 +325,7 @@ def FarData : Predicate<"TM.getCodeModel() != CodeModel::Small &&" def NearData : Predicate<"TM.getCodeModel() == CodeModel::Small ||" "TM.getCodeModel() == CodeModel::Kernel">; def IsStatic : Predicate<"TM.getRelocationModel() == Reloc::Static">; +def IsNotPIC : Predicate<"TM.getRelocationModel() != Reloc::PIC_">; def OptForSize : Predicate<"OptForSize">; def OptForSpeed : Predicate<"!OptForSize">; def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">; @@ -473,15 +476,14 @@ def trunc_su : PatFrag<(ops node:$src), (trunc node:$src), [{ def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{ if (ConstantSDNode *CN = dyn_cast(N->getOperand(1))) return CurDAG->MaskedValueIsZero(N->getOperand(0), CN->getAPIntValue()); - else { - unsigned BitWidth = N->getValueType(0).getScalarType().getSizeInBits(); - APInt Mask = APInt::getAllOnesValue(BitWidth); - APInt KnownZero0, KnownOne0; - CurDAG->ComputeMaskedBits(N->getOperand(0), Mask, KnownZero0, KnownOne0, 0); - APInt KnownZero1, KnownOne1; - CurDAG->ComputeMaskedBits(N->getOperand(1), Mask, KnownZero1, KnownOne1, 0); - return (~KnownZero0 & ~KnownZero1) == 0; - } + + unsigned BitWidth = N->getValueType(0).getScalarType().getSizeInBits(); + APInt Mask = APInt::getAllOnesValue(BitWidth); + APInt KnownZero0, KnownOne0; + CurDAG->ComputeMaskedBits(N->getOperand(0), Mask, KnownZero0, KnownOne0, 0); + APInt KnownZero1, KnownOne1; + CurDAG->ComputeMaskedBits(N->getOperand(1), Mask, KnownZero1, KnownOne1, 0); + return (~KnownZero0 & ~KnownZero1) == 0; }]>; // 'shld' and 'shrd' instruction patterns. Note that even though these have @@ -585,7 +587,7 @@ let neverHasSideEffects = 1, isNotDuplicable = 1, Uses = [ESP] in // Return instructions. 
let isTerminator = 1, isReturn = 1, isBarrier = 1, - hasCtrlDep = 1, FPForm = SpecialFP, FPFormBits = SpecialFP.Value in { + hasCtrlDep = 1, FPForm = SpecialFP in { def RET : I <0xC3, RawFrm, (outs), (ins variable_ops), "ret", [(X86retflag 0)]>; @@ -806,33 +808,29 @@ let isTwoAddress = 1 in // GR32 = bswap GR32 let Defs = [EFLAGS] in { def BSF16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), "bsf{w}\t{$src, $dst|$dst, $src}", - [(set GR16:$dst, (X86bsf GR16:$src)), (implicit EFLAGS)]>, TB; + [(set GR16:$dst, EFLAGS, (X86bsf GR16:$src))]>, TB; def BSF16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), "bsf{w}\t{$src, $dst|$dst, $src}", - [(set GR16:$dst, (X86bsf (loadi16 addr:$src))), - (implicit EFLAGS)]>, TB; + [(set GR16:$dst, EFLAGS, (X86bsf (loadi16 addr:$src)))]>, TB; def BSF32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), "bsf{l}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (X86bsf GR32:$src)), (implicit EFLAGS)]>, TB; + [(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))]>, TB; def BSF32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), "bsf{l}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (X86bsf (loadi32 addr:$src))), - (implicit EFLAGS)]>, TB; + [(set GR32:$dst, EFLAGS, (X86bsf (loadi32 addr:$src)))]>, TB; def BSR16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), "bsr{w}\t{$src, $dst|$dst, $src}", - [(set GR16:$dst, (X86bsr GR16:$src)), (implicit EFLAGS)]>, TB; + [(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))]>, TB; def BSR16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), "bsr{w}\t{$src, $dst|$dst, $src}", - [(set GR16:$dst, (X86bsr (loadi16 addr:$src))), - (implicit EFLAGS)]>, TB; + [(set GR16:$dst, EFLAGS, (X86bsr (loadi16 addr:$src)))]>, TB; def BSR32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), "bsr{l}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (X86bsr GR32:$src)), (implicit EFLAGS)]>, TB; + [(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))]>, TB; def BSR32rm : I<0xBD, MRMSrcMem, 
(outs GR32:$dst), (ins i32mem:$src), "bsr{l}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (X86bsr (loadi32 addr:$src))), - (implicit EFLAGS)]>, TB; + [(set GR32:$dst, EFLAGS, (X86bsr (loadi32 addr:$src)))]>, TB; } // Defs = [EFLAGS] let neverHasSideEffects = 1 in @@ -1697,18 +1695,17 @@ let isTwoAddress = 0 in { let Defs = [EFLAGS] in { let CodeSize = 2 in def INC8r : I<0xFE, MRM0r, (outs GR8 :$dst), (ins GR8 :$src), "inc{b}\t$dst", - [(set GR8:$dst, (add GR8:$src, 1)), - (implicit EFLAGS)]>; + [(set GR8:$dst, EFLAGS, (X86inc_flag GR8:$src))]>; + let isConvertibleToThreeAddress = 1, CodeSize = 1 in { // Can xform into LEA. def INC16r : I<0x40, AddRegFrm, (outs GR16:$dst), (ins GR16:$src), "inc{w}\t$dst", - [(set GR16:$dst, (add GR16:$src, 1)), - (implicit EFLAGS)]>, + [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src))]>, OpSize, Requires<[In32BitMode]>; def INC32r : I<0x40, AddRegFrm, (outs GR32:$dst), (ins GR32:$src), "inc{l}\t$dst", - [(set GR32:$dst, (add GR32:$src, 1)), - (implicit EFLAGS)]>, Requires<[In32BitMode]>; + [(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src))]>, + Requires<[In32BitMode]>; } let isTwoAddress = 0, CodeSize = 2 in { def INC8m : I<0xFE, MRM0m, (outs), (ins i8mem :$dst), "inc{b}\t$dst", @@ -1726,18 +1723,16 @@ let isTwoAddress = 0, CodeSize = 2 in { let CodeSize = 2 in def DEC8r : I<0xFE, MRM1r, (outs GR8 :$dst), (ins GR8 :$src), "dec{b}\t$dst", - [(set GR8:$dst, (add GR8:$src, -1)), - (implicit EFLAGS)]>; + [(set GR8:$dst, EFLAGS, (X86dec_flag GR8:$src))]>; let isConvertibleToThreeAddress = 1, CodeSize = 1 in { // Can xform into LEA. 
def DEC16r : I<0x48, AddRegFrm, (outs GR16:$dst), (ins GR16:$src), "dec{w}\t$dst", - [(set GR16:$dst, (add GR16:$src, -1)), - (implicit EFLAGS)]>, + [(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src))]>, OpSize, Requires<[In32BitMode]>; def DEC32r : I<0x48, AddRegFrm, (outs GR32:$dst), (ins GR32:$src), "dec{l}\t$dst", - [(set GR32:$dst, (add GR32:$src, -1)), - (implicit EFLAGS)]>, Requires<[In32BitMode]>; + [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src))]>, + Requires<[In32BitMode]>; } let isTwoAddress = 0, CodeSize = 2 in { @@ -1758,21 +1753,20 @@ let isTwoAddress = 0, CodeSize = 2 in { // Logical operators... let Defs = [EFLAGS] in { let isCommutable = 1 in { // X = AND Y, Z --> X = AND Z, Y -def AND8rr : I<0x20, MRMDestReg, - (outs GR8 :$dst), (ins GR8 :$src1, GR8 :$src2), - "and{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (and GR8:$src1, GR8:$src2)), - (implicit EFLAGS)]>; -def AND16rr : I<0x21, MRMDestReg, - (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), - "and{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (and GR16:$src1, GR16:$src2)), - (implicit EFLAGS)]>, OpSize; -def AND32rr : I<0x21, MRMDestReg, - (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), - "and{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (and GR32:$src1, GR32:$src2)), - (implicit EFLAGS)]>; +def AND8rr : I<0x20, MRMDestReg, + (outs GR8 :$dst), (ins GR8 :$src1, GR8 :$src2), + "and{b}\t{$src2, $dst|$dst, $src2}", + [(set GR8:$dst, EFLAGS, (X86and_flag GR8:$src1, GR8:$src2))]>; +def AND16rr : I<0x21, MRMDestReg, + (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), + "and{w}\t{$src2, $dst|$dst, $src2}", + [(set GR16:$dst, EFLAGS, (X86and_flag GR16:$src1, + GR16:$src2))]>, OpSize; +def AND32rr : I<0x21, MRMDestReg, + (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), + "and{l}\t{$src2, $dst|$dst, $src2}", + [(set GR32:$dst, EFLAGS, (X86and_flag GR32:$src1, + GR32:$src2))]>; } // AND instructions with the destination register in REG and the source register @@ -1789,45 +1783,46 @@ def 
AND32rr_REV : I<0x23, MRMSrcReg, (outs GR32:$dst), def AND8rm : I<0x22, MRMSrcMem, (outs GR8 :$dst), (ins GR8 :$src1, i8mem :$src2), "and{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (and GR8:$src1, (loadi8 addr:$src2))), - (implicit EFLAGS)]>; + [(set GR8:$dst, EFLAGS, (X86and_flag GR8:$src1, + (loadi8 addr:$src2)))]>; def AND16rm : I<0x23, MRMSrcMem, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2), "and{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (and GR16:$src1, (loadi16 addr:$src2))), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, (X86and_flag GR16:$src1, + (loadi16 addr:$src2)))]>, + OpSize; def AND32rm : I<0x23, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2), "and{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (and GR32:$src1, (loadi32 addr:$src2))), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, (X86and_flag GR32:$src1, + (loadi32 addr:$src2)))]>; def AND8ri : Ii8<0x80, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm :$src2), "and{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (and GR8:$src1, imm:$src2)), - (implicit EFLAGS)]>; + [(set GR8:$dst, EFLAGS, (X86and_flag GR8:$src1, + imm:$src2))]>; def AND16ri : Ii16<0x81, MRM4r, (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2), "and{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (and GR16:$src1, imm:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, (X86and_flag GR16:$src1, + imm:$src2))]>, OpSize; def AND32ri : Ii32<0x81, MRM4r, (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2), "and{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (and GR32:$src1, imm:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, (X86and_flag GR32:$src1, + imm:$src2))]>; def AND16ri8 : Ii8<0x83, MRM4r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), "and{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (and GR16:$src1, i16immSExt8:$src2)), - (implicit EFLAGS)]>, + [(set GR16:$dst, EFLAGS, (X86and_flag GR16:$src1, + i16immSExt8:$src2))]>, OpSize; def AND32ri8 
: Ii8<0x83, MRM4r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), "and{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (and GR32:$src1, i32immSExt8:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, (X86and_flag GR32:$src1, + i32immSExt8:$src2))]>; let isTwoAddress = 0 in { def AND8mr : I<0x20, MRMDestMem, @@ -1888,18 +1883,16 @@ let isCommutable = 1 in { // X = OR Y, Z --> X = OR Z, Y def OR8rr : I<0x08, MRMDestReg, (outs GR8 :$dst), (ins GR8 :$src1, GR8 :$src2), "or{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (or GR8:$src1, GR8:$src2)), - (implicit EFLAGS)]>; + [(set GR8:$dst, EFLAGS, (X86or_flag GR8:$src1, GR8:$src2))]>; def OR16rr : I<0x09, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), "or{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (or GR16:$src1, GR16:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, (X86or_flag GR16:$src1,GR16:$src2))]>, + OpSize; def OR32rr : I<0x09, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), "or{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (or GR32:$src1, GR32:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, (X86or_flag GR32:$src1,GR32:$src2))]>; } // OR instructions with the destination register in REG and the source register @@ -1913,48 +1906,48 @@ def OR32rr_REV : I<0x0B, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), "or{l}\t{$src2, $dst|$dst, $src2}", []>; -def OR8rm : I<0x0A, MRMSrcMem , (outs GR8 :$dst), +def OR8rm : I<0x0A, MRMSrcMem, (outs GR8 :$dst), (ins GR8 :$src1, i8mem :$src2), "or{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (or GR8:$src1, (load addr:$src2))), - (implicit EFLAGS)]>; -def OR16rm : I<0x0B, MRMSrcMem , (outs GR16:$dst), + [(set GR8:$dst, EFLAGS, (X86or_flag GR8:$src1, + (load addr:$src2)))]>; +def OR16rm : I<0x0B, MRMSrcMem, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2), "or{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (or GR16:$src1, (load addr:$src2))), - (implicit EFLAGS)]>, OpSize; -def 
OR32rm : I<0x0B, MRMSrcMem , (outs GR32:$dst), + [(set GR16:$dst, EFLAGS, (X86or_flag GR16:$src1, + (load addr:$src2)))]>, + OpSize; +def OR32rm : I<0x0B, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2), "or{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (or GR32:$src1, (load addr:$src2))), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, (X86or_flag GR32:$src1, + (load addr:$src2)))]>; def OR8ri : Ii8 <0x80, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2), "or{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (or GR8:$src1, imm:$src2)), - (implicit EFLAGS)]>; + [(set GR8:$dst,EFLAGS, (X86or_flag GR8:$src1, imm:$src2))]>; def OR16ri : Ii16<0x81, MRM1r, (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2), "or{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (or GR16:$src1, imm:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, (X86or_flag GR16:$src1, + imm:$src2))]>, OpSize; def OR32ri : Ii32<0x81, MRM1r, (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2), "or{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (or GR32:$src1, imm:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, (X86or_flag GR32:$src1, + imm:$src2))]>; def OR16ri8 : Ii8<0x83, MRM1r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), "or{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (or GR16:$src1, i16immSExt8:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, (X86or_flag GR16:$src1, + i16immSExt8:$src2))]>, OpSize; def OR32ri8 : Ii8<0x83, MRM1r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), "or{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (or GR32:$src1, i32immSExt8:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, (X86or_flag GR32:$src1, + i32immSExt8:$src2))]>; let isTwoAddress = 0 in { def OR8mr : I<0x08, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src), "or{b}\t{$src, $dst|$dst, $src}", @@ -2004,18 +1997,18 @@ let isCommutable = 1 in { // X = XOR Y, Z --> X = XOR Z, Y def XOR8rr : I<0x30, MRMDestReg, (outs GR8 
:$dst), (ins GR8 :$src1, GR8 :$src2), "xor{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (xor GR8:$src1, GR8:$src2)), - (implicit EFLAGS)]>; + [(set GR8:$dst, EFLAGS, (X86xor_flag GR8:$src1, + GR8:$src2))]>; def XOR16rr : I<0x31, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), "xor{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (xor GR16:$src1, GR16:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, (X86xor_flag GR16:$src1, + GR16:$src2))]>, OpSize; def XOR32rr : I<0x31, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), "xor{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (xor GR32:$src1, GR32:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, (X86xor_flag GR32:$src1, + GR32:$src2))]>; } // isCommutable = 1 // XOR instructions with the destination register in REG and the source register @@ -2029,49 +2022,48 @@ def XOR32rr_REV : I<0x33, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), "xor{l}\t{$src2, $dst|$dst, $src2}", []>; -def XOR8rm : I<0x32, MRMSrcMem , +def XOR8rm : I<0x32, MRMSrcMem, (outs GR8 :$dst), (ins GR8:$src1, i8mem :$src2), "xor{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (xor GR8:$src1, (load addr:$src2))), - (implicit EFLAGS)]>; -def XOR16rm : I<0x33, MRMSrcMem , + [(set GR8:$dst, EFLAGS, (X86xor_flag GR8:$src1, + (load addr:$src2)))]>; +def XOR16rm : I<0x33, MRMSrcMem, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2), "xor{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (xor GR16:$src1, (load addr:$src2))), - (implicit EFLAGS)]>, + [(set GR16:$dst, EFLAGS, (X86xor_flag GR16:$src1, + (load addr:$src2)))]>, OpSize; -def XOR32rm : I<0x33, MRMSrcMem , +def XOR32rm : I<0x33, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2), "xor{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (xor GR32:$src1, (load addr:$src2))), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, (X86xor_flag GR32:$src1, + (load addr:$src2)))]>; -def XOR8ri : Ii8<0x80, MRM6r, - (outs GR8:$dst), 
(ins GR8:$src1, i8imm:$src2), - "xor{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (xor GR8:$src1, imm:$src2)), - (implicit EFLAGS)]>; -def XOR16ri : Ii16<0x81, MRM6r, - (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2), - "xor{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (xor GR16:$src1, imm:$src2)), - (implicit EFLAGS)]>, OpSize; +def XOR8ri : Ii8<0x80, MRM6r, + (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2), + "xor{b}\t{$src2, $dst|$dst, $src2}", + [(set GR8:$dst, EFLAGS, (X86xor_flag GR8:$src1, imm:$src2))]>; +def XOR16ri : Ii16<0x81, MRM6r, + (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2), + "xor{w}\t{$src2, $dst|$dst, $src2}", + [(set GR16:$dst, EFLAGS, (X86xor_flag GR16:$src1, + imm:$src2))]>, OpSize; def XOR32ri : Ii32<0x81, MRM6r, (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2), "xor{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (xor GR32:$src1, imm:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, (X86xor_flag GR32:$src1, + imm:$src2))]>; def XOR16ri8 : Ii8<0x83, MRM6r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), "xor{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (xor GR16:$src1, i16immSExt8:$src2)), - (implicit EFLAGS)]>, + [(set GR16:$dst, EFLAGS, (X86xor_flag GR16:$src1, + i16immSExt8:$src2))]>, OpSize; def XOR32ri8 : Ii8<0x83, MRM6r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), "xor{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (xor GR32:$src1, i32immSExt8:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, (X86xor_flag GR32:$src1, + i32immSExt8:$src2))]>; let isTwoAddress = 0 in { def XOR8mr : I<0x30, MRMDestMem, @@ -2118,12 +2110,12 @@ let isTwoAddress = 0 in { [(store (xor (load addr:$dst), i32immSExt8:$src), addr:$dst), (implicit EFLAGS)]>; - def XOR8i8 : Ii8 <0x34, RawFrm, (outs), (ins i8imm:$src), - "xor{b}\t{$src, %al|%al, $src}", []>; - def XOR16i16 : Ii16 <0x35, RawFrm, (outs), (ins i16imm:$src), - "xor{w}\t{$src, %ax|%ax, $src}", []>, OpSize; - def XOR32i32 : Ii32 <0x35, RawFrm, 
(outs), (ins i32imm:$src), - "xor{l}\t{$src, %eax|%eax, $src}", []>; + def XOR8i8 : Ii8 <0x34, RawFrm, (outs), (ins i8imm:$src), + "xor{b}\t{$src, %al|%al, $src}", []>; + def XOR16i16 : Ii16<0x35, RawFrm, (outs), (ins i16imm:$src), + "xor{w}\t{$src, %ax|%ax, $src}", []>, OpSize; + def XOR32i32 : Ii32<0x35, RawFrm, (outs), (ins i32imm:$src), + "xor{l}\t{$src, %eax|%eax, $src}", []>; } // isTwoAddress = 0 } // Defs = [EFLAGS] @@ -2690,21 +2682,20 @@ let isCommutable = 1 in { // X = ADD Y, Z --> X = ADD Z, Y def ADD8rr : I<0x00, MRMDestReg, (outs GR8 :$dst), (ins GR8 :$src1, GR8 :$src2), "add{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (add GR8:$src1, GR8:$src2)), - (implicit EFLAGS)]>; + [(set GR8:$dst, EFLAGS, (X86add_flag GR8:$src1, GR8:$src2))]>; let isConvertibleToThreeAddress = 1 in { // Can transform into LEA. // Register-Register Addition def ADD16rr : I<0x01, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), "add{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (add GR16:$src1, GR16:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, (X86add_flag GR16:$src1, + GR16:$src2))]>, OpSize; def ADD32rr : I<0x01, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), "add{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (add GR32:$src1, GR32:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, (X86add_flag GR32:$src1, + GR32:$src2))]>; } // end isConvertibleToThreeAddress } // end isCommutable @@ -2723,47 +2714,47 @@ let isCodeGenOnly = 1 in { def ADD8rm : I<0x02, MRMSrcMem, (outs GR8 :$dst), (ins GR8 :$src1, i8mem :$src2), "add{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (add GR8:$src1, (load addr:$src2))), - (implicit EFLAGS)]>; + [(set GR8:$dst, EFLAGS, (X86add_flag GR8:$src1, + (load addr:$src2)))]>; def ADD16rm : I<0x03, MRMSrcMem, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2), "add{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (add GR16:$src1, (load addr:$src2))), - (implicit EFLAGS)]>, OpSize; + [(set 
GR16:$dst, EFLAGS, (X86add_flag GR16:$src1, + (load addr:$src2)))]>, OpSize; def ADD32rm : I<0x03, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2), "add{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (add GR32:$src1, (load addr:$src2))), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, (X86add_flag GR32:$src1, + (load addr:$src2)))]>; // Register-Integer Addition def ADD8ri : Ii8<0x80, MRM0r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2), "add{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (add GR8:$src1, imm:$src2)), - (implicit EFLAGS)]>; + [(set GR8:$dst, EFLAGS, + (X86add_flag GR8:$src1, imm:$src2))]>; let isConvertibleToThreeAddress = 1 in { // Can transform into LEA. // Register-Integer Addition def ADD16ri : Ii16<0x81, MRM0r, (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2), "add{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (add GR16:$src1, imm:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, + (X86add_flag GR16:$src1, imm:$src2))]>, OpSize; def ADD32ri : Ii32<0x81, MRM0r, (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2), "add{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (add GR32:$src1, imm:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, + (X86add_flag GR32:$src1, imm:$src2))]>; def ADD16ri8 : Ii8<0x83, MRM0r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), "add{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (add GR16:$src1, i16immSExt8:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, + (X86add_flag GR16:$src1, i16immSExt8:$src2))]>, OpSize; def ADD32ri8 : Ii8<0x83, MRM0r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), "add{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (add GR32:$src1, i32immSExt8:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, + (X86add_flag GR32:$src1, i32immSExt8:$src2))]>; } let isTwoAddress = 0 in { @@ -2911,16 +2902,16 @@ let isTwoAddress = 0 in { // Register-Register Subtraction def SUB8rr : I<0x28, MRMDestReg, (outs GR8:$dst), 
(ins GR8:$src1, GR8:$src2), "sub{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (sub GR8:$src1, GR8:$src2)), - (implicit EFLAGS)]>; + [(set GR8:$dst, EFLAGS, + (X86sub_flag GR8:$src1, GR8:$src2))]>; def SUB16rr : I<0x29, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1,GR16:$src2), "sub{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (sub GR16:$src1, GR16:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, + (X86sub_flag GR16:$src1, GR16:$src2))]>, OpSize; def SUB32rr : I<0x29, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1,GR32:$src2), "sub{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (sub GR32:$src1, GR32:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, + (X86sub_flag GR32:$src1, GR32:$src2))]>; def SUB8rr_REV : I<0x2A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src1, GR8:$src2), "sub{b}\t{$src2, $dst|$dst, $src2}", []>; @@ -2935,45 +2926,45 @@ def SUB32rr_REV : I<0x2B, MRMSrcReg, (outs GR32:$dst), def SUB8rm : I<0x2A, MRMSrcMem, (outs GR8 :$dst), (ins GR8 :$src1, i8mem :$src2), "sub{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (sub GR8:$src1, (load addr:$src2))), - (implicit EFLAGS)]>; + [(set GR8:$dst, EFLAGS, + (X86sub_flag GR8:$src1, (load addr:$src2)))]>; def SUB16rm : I<0x2B, MRMSrcMem, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2), "sub{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (sub GR16:$src1, (load addr:$src2))), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, + (X86sub_flag GR16:$src1, (load addr:$src2)))]>, OpSize; def SUB32rm : I<0x2B, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2), "sub{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (sub GR32:$src1, (load addr:$src2))), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, + (X86sub_flag GR32:$src1, (load addr:$src2)))]>; // Register-Integer Subtraction def SUB8ri : Ii8 <0x80, MRM5r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2), "sub{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (sub GR8:$src1, imm:$src2)), - (implicit 
EFLAGS)]>; + [(set GR8:$dst, EFLAGS, + (X86sub_flag GR8:$src1, imm:$src2))]>; def SUB16ri : Ii16<0x81, MRM5r, (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2), "sub{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (sub GR16:$src1, imm:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, + (X86sub_flag GR16:$src1, imm:$src2))]>, OpSize; def SUB32ri : Ii32<0x81, MRM5r, (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2), "sub{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (sub GR32:$src1, imm:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, + (X86sub_flag GR32:$src1, imm:$src2))]>; def SUB16ri8 : Ii8<0x83, MRM5r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), "sub{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (sub GR16:$src1, i16immSExt8:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, + (X86sub_flag GR16:$src1, i16immSExt8:$src2))]>, OpSize; def SUB32ri8 : Ii8<0x83, MRM5r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), "sub{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (sub GR32:$src1, i32immSExt8:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, + (X86sub_flag GR32:$src1, i32immSExt8:$src2))]>; let isTwoAddress = 0 in { // Memory-Register Subtraction @@ -3122,25 +3113,26 @@ let isCommutable = 1 in { // X = IMUL Y, Z --> X = IMUL Z, Y // Register-Register Signed Integer Multiply def IMUL16rr : I<0xAF, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1,GR16:$src2), "imul{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (mul GR16:$src1, GR16:$src2)), - (implicit EFLAGS)]>, TB, OpSize; + [(set GR16:$dst, EFLAGS, + (X86smul_flag GR16:$src1, GR16:$src2))]>, TB, OpSize; def IMUL32rr : I<0xAF, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1,GR32:$src2), "imul{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (mul GR32:$src1, GR32:$src2)), - (implicit EFLAGS)]>, TB; + [(set GR32:$dst, EFLAGS, + (X86smul_flag GR32:$src1, GR32:$src2))]>, TB; } // Register-Memory Signed Integer Multiply def IMUL16rm : I<0xAF, 
MRMSrcMem, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2), "imul{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (mul GR16:$src1, (load addr:$src2))), - (implicit EFLAGS)]>, TB, OpSize; + [(set GR16:$dst, EFLAGS, + (X86smul_flag GR16:$src1, (load addr:$src2)))]>, + TB, OpSize; def IMUL32rm : I<0xAF, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2), "imul{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (mul GR32:$src1, (load addr:$src2))), - (implicit EFLAGS)]>, TB; + [(set GR32:$dst, EFLAGS, + (X86smul_flag GR32:$src1, (load addr:$src2)))]>, TB; } // Defs = [EFLAGS] } // end Two Address instructions @@ -3150,47 +3142,49 @@ let Defs = [EFLAGS] in { def IMUL16rri : Ii16<0x69, MRMSrcReg, // GR16 = GR16*I16 (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2), "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR16:$dst, (mul GR16:$src1, imm:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, + (X86smul_flag GR16:$src1, imm:$src2))]>, OpSize; def IMUL32rri : Ii32<0x69, MRMSrcReg, // GR32 = GR32*I32 (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2), "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR32:$dst, (mul GR32:$src1, imm:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, + (X86smul_flag GR32:$src1, imm:$src2))]>; def IMUL16rri8 : Ii8<0x6B, MRMSrcReg, // GR16 = GR16*I8 (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR16:$dst, (mul GR16:$src1, i16immSExt8:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, + (X86smul_flag GR16:$src1, i16immSExt8:$src2))]>, + OpSize; def IMUL32rri8 : Ii8<0x6B, MRMSrcReg, // GR32 = GR32*I8 (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR32:$dst, (mul GR32:$src1, i32immSExt8:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, + (X86smul_flag GR32:$src1, i32immSExt8:$src2))]>; // Memory-Integer Signed Integer Multiply def 
IMUL16rmi : Ii16<0x69, MRMSrcMem, // GR16 = [mem16]*I16 (outs GR16:$dst), (ins i16mem:$src1, i16imm:$src2), "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR16:$dst, (mul (load addr:$src1), imm:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, + (X86smul_flag (load addr:$src1), imm:$src2))]>, + OpSize; def IMUL32rmi : Ii32<0x69, MRMSrcMem, // GR32 = [mem32]*I32 (outs GR32:$dst), (ins i32mem:$src1, i32imm:$src2), "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR32:$dst, (mul (load addr:$src1), imm:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, + (X86smul_flag (load addr:$src1), imm:$src2))]>; def IMUL16rmi8 : Ii8<0x6B, MRMSrcMem, // GR16 = [mem16]*I8 (outs GR16:$dst), (ins i16mem:$src1, i16i8imm :$src2), "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR16:$dst, (mul (load addr:$src1), - i16immSExt8:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, + (X86smul_flag (load addr:$src1), + i16immSExt8:$src2))]>, OpSize; def IMUL32rmi8 : Ii8<0x6B, MRMSrcMem, // GR32 = [mem32]*I8 (outs GR32:$dst), (ins i32mem:$src1, i32i8imm: $src2), "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR32:$dst, (mul (load addr:$src1), - i32immSExt8:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, + (X86smul_flag (load addr:$src1), + i32immSExt8:$src2))]>; } // Defs = [EFLAGS] //===----------------------------------------------------------------------===// @@ -4345,9 +4339,12 @@ def : Pat<(X86tcret GR32_TC:$dst, imm:$off), (TCRETURNri GR32_TC:$dst, imm:$off)>, Requires<[In32BitMode]>; +// FIXME: This is disabled for 32-bit PIC mode because the global base +// register which is part of the address mode may be assigned a +// callee-saved register. 
def : Pat<(X86tcret (load addr:$dst), imm:$off), (TCRETURNmi addr:$dst, imm:$off)>, - Requires<[In32BitMode]>; + Requires<[In32BitMode, IsNotPIC]>; def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off), (TCRETURNdi texternalsym:$dst, imm:$off)>, @@ -4722,23 +4719,17 @@ def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), // (or x1, x2) -> (add x1, x2) if two operands are known not to share bits. let AddedComplexity = 5 in { // Try this before the selecting to OR -def : Pat<(parallel (or_is_add GR16:$src1, imm:$src2), - (implicit EFLAGS)), +def : Pat<(or_is_add GR16:$src1, imm:$src2), (ADD16ri GR16:$src1, imm:$src2)>; -def : Pat<(parallel (or_is_add GR32:$src1, imm:$src2), - (implicit EFLAGS)), +def : Pat<(or_is_add GR32:$src1, imm:$src2), (ADD32ri GR32:$src1, imm:$src2)>; -def : Pat<(parallel (or_is_add GR16:$src1, i16immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(or_is_add GR16:$src1, i16immSExt8:$src2), (ADD16ri8 GR16:$src1, i16immSExt8:$src2)>; -def : Pat<(parallel (or_is_add GR32:$src1, i32immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(or_is_add GR32:$src1, i32immSExt8:$src2), (ADD32ri8 GR32:$src1, i32immSExt8:$src2)>; -def : Pat<(parallel (or_is_add GR16:$src1, GR16:$src2), - (implicit EFLAGS)), +def : Pat<(or_is_add GR16:$src1, GR16:$src2), (ADD16rr GR16:$src1, GR16:$src2)>; -def : Pat<(parallel (or_is_add GR32:$src1, GR32:$src2), - (implicit EFLAGS)), +def : Pat<(or_is_add GR32:$src1, GR32:$src2), (ADD32rr GR32:$src1, GR32:$src2)>; } // AddedComplexity @@ -4746,270 +4737,173 @@ def : Pat<(parallel (or_is_add GR32:$src1, GR32:$src2), // EFLAGS-defining Patterns //===----------------------------------------------------------------------===// -// Register-Register Addition with EFLAGS result -def : Pat<(parallel (X86add_flag GR8:$src1, GR8:$src2), - (implicit EFLAGS)), - (ADD8rr GR8:$src1, GR8:$src2)>; -def : Pat<(parallel (X86add_flag GR16:$src1, GR16:$src2), - (implicit EFLAGS)), - (ADD16rr GR16:$src1, GR16:$src2)>; -def : Pat<(parallel 
(X86add_flag GR32:$src1, GR32:$src2), - (implicit EFLAGS)), - (ADD32rr GR32:$src1, GR32:$src2)>; +// add reg, reg +def : Pat<(add GR8 :$src1, GR8 :$src2), (ADD8rr GR8 :$src1, GR8 :$src2)>; +def : Pat<(add GR16:$src1, GR16:$src2), (ADD16rr GR16:$src1, GR16:$src2)>; +def : Pat<(add GR32:$src1, GR32:$src2), (ADD32rr GR32:$src1, GR32:$src2)>; -// Register-Memory Addition with EFLAGS result -def : Pat<(parallel (X86add_flag GR8:$src1, (loadi8 addr:$src2)), - (implicit EFLAGS)), +// add reg, mem +def : Pat<(add GR8:$src1, (loadi8 addr:$src2)), (ADD8rm GR8:$src1, addr:$src2)>; -def : Pat<(parallel (X86add_flag GR16:$src1, (loadi16 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(add GR16:$src1, (loadi16 addr:$src2)), (ADD16rm GR16:$src1, addr:$src2)>; -def : Pat<(parallel (X86add_flag GR32:$src1, (loadi32 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(add GR32:$src1, (loadi32 addr:$src2)), (ADD32rm GR32:$src1, addr:$src2)>; -// Register-Integer Addition with EFLAGS result -def : Pat<(parallel (X86add_flag GR8:$src1, imm:$src2), - (implicit EFLAGS)), - (ADD8ri GR8:$src1, imm:$src2)>; -def : Pat<(parallel (X86add_flag GR16:$src1, imm:$src2), - (implicit EFLAGS)), - (ADD16ri GR16:$src1, imm:$src2)>; -def : Pat<(parallel (X86add_flag GR32:$src1, imm:$src2), - (implicit EFLAGS)), - (ADD32ri GR32:$src1, imm:$src2)>; -def : Pat<(parallel (X86add_flag GR16:$src1, i16immSExt8:$src2), - (implicit EFLAGS)), +// add reg, imm +def : Pat<(add GR8 :$src1, imm:$src2), (ADD8ri GR8:$src1 , imm:$src2)>; +def : Pat<(add GR16:$src1, imm:$src2), (ADD16ri GR16:$src1, imm:$src2)>; +def : Pat<(add GR32:$src1, imm:$src2), (ADD32ri GR32:$src1, imm:$src2)>; +def : Pat<(add GR16:$src1, i16immSExt8:$src2), (ADD16ri8 GR16:$src1, i16immSExt8:$src2)>; -def : Pat<(parallel (X86add_flag GR32:$src1, i32immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(add GR32:$src1, i32immSExt8:$src2), (ADD32ri8 GR32:$src1, i32immSExt8:$src2)>; -// Register-Register Subtraction with EFLAGS result -def : Pat<(parallel 
(X86sub_flag GR8:$src1, GR8:$src2), - (implicit EFLAGS)), - (SUB8rr GR8:$src1, GR8:$src2)>; -def : Pat<(parallel (X86sub_flag GR16:$src1, GR16:$src2), - (implicit EFLAGS)), - (SUB16rr GR16:$src1, GR16:$src2)>; -def : Pat<(parallel (X86sub_flag GR32:$src1, GR32:$src2), - (implicit EFLAGS)), - (SUB32rr GR32:$src1, GR32:$src2)>; +// sub reg, reg +def : Pat<(sub GR8 :$src1, GR8 :$src2), (SUB8rr GR8 :$src1, GR8 :$src2)>; +def : Pat<(sub GR16:$src1, GR16:$src2), (SUB16rr GR16:$src1, GR16:$src2)>; +def : Pat<(sub GR32:$src1, GR32:$src2), (SUB32rr GR32:$src1, GR32:$src2)>; -// Register-Memory Subtraction with EFLAGS result -def : Pat<(parallel (X86sub_flag GR8:$src1, (loadi8 addr:$src2)), - (implicit EFLAGS)), +// sub reg, mem +def : Pat<(sub GR8:$src1, (loadi8 addr:$src2)), (SUB8rm GR8:$src1, addr:$src2)>; -def : Pat<(parallel (X86sub_flag GR16:$src1, (loadi16 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(sub GR16:$src1, (loadi16 addr:$src2)), (SUB16rm GR16:$src1, addr:$src2)>; -def : Pat<(parallel (X86sub_flag GR32:$src1, (loadi32 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(sub GR32:$src1, (loadi32 addr:$src2)), (SUB32rm GR32:$src1, addr:$src2)>; -// Register-Integer Subtraction with EFLAGS result -def : Pat<(parallel (X86sub_flag GR8:$src1, imm:$src2), - (implicit EFLAGS)), +// sub reg, imm +def : Pat<(sub GR8:$src1, imm:$src2), (SUB8ri GR8:$src1, imm:$src2)>; -def : Pat<(parallel (X86sub_flag GR16:$src1, imm:$src2), - (implicit EFLAGS)), +def : Pat<(sub GR16:$src1, imm:$src2), (SUB16ri GR16:$src1, imm:$src2)>; -def : Pat<(parallel (X86sub_flag GR32:$src1, imm:$src2), - (implicit EFLAGS)), +def : Pat<(sub GR32:$src1, imm:$src2), (SUB32ri GR32:$src1, imm:$src2)>; -def : Pat<(parallel (X86sub_flag GR16:$src1, i16immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(sub GR16:$src1, i16immSExt8:$src2), (SUB16ri8 GR16:$src1, i16immSExt8:$src2)>; -def : Pat<(parallel (X86sub_flag GR32:$src1, i32immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(sub GR32:$src1, 
i32immSExt8:$src2), (SUB32ri8 GR32:$src1, i32immSExt8:$src2)>; -// Register-Register Signed Integer Multiply with EFLAGS result -def : Pat<(parallel (X86smul_flag GR16:$src1, GR16:$src2), - (implicit EFLAGS)), +// mul reg, reg +def : Pat<(mul GR16:$src1, GR16:$src2), (IMUL16rr GR16:$src1, GR16:$src2)>; -def : Pat<(parallel (X86smul_flag GR32:$src1, GR32:$src2), - (implicit EFLAGS)), +def : Pat<(mul GR32:$src1, GR32:$src2), (IMUL32rr GR32:$src1, GR32:$src2)>; -// Register-Memory Signed Integer Multiply with EFLAGS result -def : Pat<(parallel (X86smul_flag GR16:$src1, (loadi16 addr:$src2)), - (implicit EFLAGS)), +// mul reg, mem +def : Pat<(mul GR16:$src1, (loadi16 addr:$src2)), (IMUL16rm GR16:$src1, addr:$src2)>; -def : Pat<(parallel (X86smul_flag GR32:$src1, (loadi32 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(mul GR32:$src1, (loadi32 addr:$src2)), (IMUL32rm GR32:$src1, addr:$src2)>; -// Register-Integer Signed Integer Multiply with EFLAGS result -def : Pat<(parallel (X86smul_flag GR16:$src1, imm:$src2), - (implicit EFLAGS)), +// mul reg, imm +def : Pat<(mul GR16:$src1, imm:$src2), (IMUL16rri GR16:$src1, imm:$src2)>; -def : Pat<(parallel (X86smul_flag GR32:$src1, imm:$src2), - (implicit EFLAGS)), +def : Pat<(mul GR32:$src1, imm:$src2), (IMUL32rri GR32:$src1, imm:$src2)>; -def : Pat<(parallel (X86smul_flag GR16:$src1, i16immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(mul GR16:$src1, i16immSExt8:$src2), (IMUL16rri8 GR16:$src1, i16immSExt8:$src2)>; -def : Pat<(parallel (X86smul_flag GR32:$src1, i32immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(mul GR32:$src1, i32immSExt8:$src2), (IMUL32rri8 GR32:$src1, i32immSExt8:$src2)>; -// Memory-Integer Signed Integer Multiply with EFLAGS result -def : Pat<(parallel (X86smul_flag (loadi16 addr:$src1), imm:$src2), - (implicit EFLAGS)), +// reg = mul mem, imm +def : Pat<(mul (loadi16 addr:$src1), imm:$src2), (IMUL16rmi addr:$src1, imm:$src2)>; -def : Pat<(parallel (X86smul_flag (loadi32 addr:$src1), imm:$src2), - 
(implicit EFLAGS)), +def : Pat<(mul (loadi32 addr:$src1), imm:$src2), (IMUL32rmi addr:$src1, imm:$src2)>; -def : Pat<(parallel (X86smul_flag (loadi16 addr:$src1), i16immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(mul (loadi16 addr:$src1), i16immSExt8:$src2), (IMUL16rmi8 addr:$src1, i16immSExt8:$src2)>; -def : Pat<(parallel (X86smul_flag (loadi32 addr:$src1), i32immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(mul (loadi32 addr:$src1), i32immSExt8:$src2), (IMUL32rmi8 addr:$src1, i32immSExt8:$src2)>; // Optimize multiply by 2 with EFLAGS result. let AddedComplexity = 2 in { -def : Pat<(parallel (X86smul_flag GR16:$src1, 2), - (implicit EFLAGS)), - (ADD16rr GR16:$src1, GR16:$src1)>; - -def : Pat<(parallel (X86smul_flag GR32:$src1, 2), - (implicit EFLAGS)), - (ADD32rr GR32:$src1, GR32:$src1)>; +def : Pat<(X86smul_flag GR16:$src1, 2), (ADD16rr GR16:$src1, GR16:$src1)>; +def : Pat<(X86smul_flag GR32:$src1, 2), (ADD32rr GR32:$src1, GR32:$src1)>; } -// INC and DEC with EFLAGS result. Note that these do not set CF. -def : Pat<(parallel (X86inc_flag GR8:$src), (implicit EFLAGS)), - (INC8r GR8:$src)>; -def : Pat<(parallel (X86dec_flag GR8:$src), (implicit EFLAGS)), - (DEC8r GR8:$src)>; +// Patterns for nodes that do not produce flags, for instructions that do. -def : Pat<(parallel (X86inc_flag GR16:$src), (implicit EFLAGS)), - (INC16r GR16:$src)>, Requires<[In32BitMode]>; -def : Pat<(parallel (X86dec_flag GR16:$src), (implicit EFLAGS)), - (DEC16r GR16:$src)>, Requires<[In32BitMode]>; +// Increment reg. +def : Pat<(add GR8:$src , 1), (INC8r GR8:$src)>; +def : Pat<(add GR16:$src, 1), (INC16r GR16:$src)>, Requires<[In32BitMode]>; +def : Pat<(add GR32:$src, 1), (INC32r GR32:$src)>, Requires<[In32BitMode]>; -def : Pat<(parallel (X86inc_flag GR32:$src), (implicit EFLAGS)), - (INC32r GR32:$src)>, Requires<[In32BitMode]>; -def : Pat<(parallel (X86dec_flag GR32:$src), (implicit EFLAGS)), - (DEC32r GR32:$src)>, Requires<[In32BitMode]>; +// Decrement reg. 
+def : Pat<(add GR8:$src , -1), (DEC8r GR8:$src)>; +def : Pat<(add GR16:$src, -1), (DEC16r GR16:$src)>, Requires<[In32BitMode]>; +def : Pat<(add GR32:$src, -1), (DEC32r GR32:$src)>, Requires<[In32BitMode]>; -// Register-Register Or with EFLAGS result -def : Pat<(parallel (X86or_flag GR8:$src1, GR8:$src2), - (implicit EFLAGS)), - (OR8rr GR8:$src1, GR8:$src2)>; -def : Pat<(parallel (X86or_flag GR16:$src1, GR16:$src2), - (implicit EFLAGS)), - (OR16rr GR16:$src1, GR16:$src2)>; -def : Pat<(parallel (X86or_flag GR32:$src1, GR32:$src2), - (implicit EFLAGS)), - (OR32rr GR32:$src1, GR32:$src2)>; +// or reg/reg. +def : Pat<(or GR8 :$src1, GR8 :$src2), (OR8rr GR8 :$src1, GR8 :$src2)>; +def : Pat<(or GR16:$src1, GR16:$src2), (OR16rr GR16:$src1, GR16:$src2)>; +def : Pat<(or GR32:$src1, GR32:$src2), (OR32rr GR32:$src1, GR32:$src2)>; -// Register-Memory Or with EFLAGS result -def : Pat<(parallel (X86or_flag GR8:$src1, (loadi8 addr:$src2)), - (implicit EFLAGS)), +// or reg/mem +def : Pat<(or GR8:$src1, (loadi8 addr:$src2)), (OR8rm GR8:$src1, addr:$src2)>; -def : Pat<(parallel (X86or_flag GR16:$src1, (loadi16 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(or GR16:$src1, (loadi16 addr:$src2)), (OR16rm GR16:$src1, addr:$src2)>; -def : Pat<(parallel (X86or_flag GR32:$src1, (loadi32 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(or GR32:$src1, (loadi32 addr:$src2)), (OR32rm GR32:$src1, addr:$src2)>; -// Register-Integer Or with EFLAGS result -def : Pat<(parallel (X86or_flag GR8:$src1, imm:$src2), - (implicit EFLAGS)), - (OR8ri GR8:$src1, imm:$src2)>; -def : Pat<(parallel (X86or_flag GR16:$src1, imm:$src2), - (implicit EFLAGS)), - (OR16ri GR16:$src1, imm:$src2)>; -def : Pat<(parallel (X86or_flag GR32:$src1, imm:$src2), - (implicit EFLAGS)), - (OR32ri GR32:$src1, imm:$src2)>; -def : Pat<(parallel (X86or_flag GR16:$src1, i16immSExt8:$src2), - (implicit EFLAGS)), +// or reg/imm +def : Pat<(or GR8:$src1 , imm:$src2), (OR8ri GR8 :$src1, imm:$src2)>; +def : Pat<(or GR16:$src1, imm:$src2), 
(OR16ri GR16:$src1, imm:$src2)>; +def : Pat<(or GR32:$src1, imm:$src2), (OR32ri GR32:$src1, imm:$src2)>; +def : Pat<(or GR16:$src1, i16immSExt8:$src2), (OR16ri8 GR16:$src1, i16immSExt8:$src2)>; -def : Pat<(parallel (X86or_flag GR32:$src1, i32immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(or GR32:$src1, i32immSExt8:$src2), (OR32ri8 GR32:$src1, i32immSExt8:$src2)>; -// Register-Register XOr with EFLAGS result -def : Pat<(parallel (X86xor_flag GR8:$src1, GR8:$src2), - (implicit EFLAGS)), - (XOR8rr GR8:$src1, GR8:$src2)>; -def : Pat<(parallel (X86xor_flag GR16:$src1, GR16:$src2), - (implicit EFLAGS)), - (XOR16rr GR16:$src1, GR16:$src2)>; -def : Pat<(parallel (X86xor_flag GR32:$src1, GR32:$src2), - (implicit EFLAGS)), - (XOR32rr GR32:$src1, GR32:$src2)>; +// xor reg/reg +def : Pat<(xor GR8 :$src1, GR8 :$src2), (XOR8rr GR8 :$src1, GR8 :$src2)>; +def : Pat<(xor GR16:$src1, GR16:$src2), (XOR16rr GR16:$src1, GR16:$src2)>; +def : Pat<(xor GR32:$src1, GR32:$src2), (XOR32rr GR32:$src1, GR32:$src2)>; -// Register-Memory XOr with EFLAGS result -def : Pat<(parallel (X86xor_flag GR8:$src1, (loadi8 addr:$src2)), - (implicit EFLAGS)), +// xor reg/mem +def : Pat<(xor GR8:$src1, (loadi8 addr:$src2)), (XOR8rm GR8:$src1, addr:$src2)>; -def : Pat<(parallel (X86xor_flag GR16:$src1, (loadi16 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(xor GR16:$src1, (loadi16 addr:$src2)), (XOR16rm GR16:$src1, addr:$src2)>; -def : Pat<(parallel (X86xor_flag GR32:$src1, (loadi32 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(xor GR32:$src1, (loadi32 addr:$src2)), (XOR32rm GR32:$src1, addr:$src2)>; -// Register-Integer XOr with EFLAGS result -def : Pat<(parallel (X86xor_flag GR8:$src1, imm:$src2), - (implicit EFLAGS)), +// xor reg/imm +def : Pat<(xor GR8:$src1, imm:$src2), (XOR8ri GR8:$src1, imm:$src2)>; -def : Pat<(parallel (X86xor_flag GR16:$src1, imm:$src2), - (implicit EFLAGS)), +def : Pat<(xor GR16:$src1, imm:$src2), (XOR16ri GR16:$src1, imm:$src2)>; -def : Pat<(parallel (X86xor_flag 
GR32:$src1, imm:$src2), - (implicit EFLAGS)), +def : Pat<(xor GR32:$src1, imm:$src2), (XOR32ri GR32:$src1, imm:$src2)>; -def : Pat<(parallel (X86xor_flag GR16:$src1, i16immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(xor GR16:$src1, i16immSExt8:$src2), (XOR16ri8 GR16:$src1, i16immSExt8:$src2)>; -def : Pat<(parallel (X86xor_flag GR32:$src1, i32immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(xor GR32:$src1, i32immSExt8:$src2), (XOR32ri8 GR32:$src1, i32immSExt8:$src2)>; -// Register-Register And with EFLAGS result -def : Pat<(parallel (X86and_flag GR8:$src1, GR8:$src2), - (implicit EFLAGS)), - (AND8rr GR8:$src1, GR8:$src2)>; -def : Pat<(parallel (X86and_flag GR16:$src1, GR16:$src2), - (implicit EFLAGS)), - (AND16rr GR16:$src1, GR16:$src2)>; -def : Pat<(parallel (X86and_flag GR32:$src1, GR32:$src2), - (implicit EFLAGS)), - (AND32rr GR32:$src1, GR32:$src2)>; +// and reg/reg +def : Pat<(and GR8 :$src1, GR8 :$src2), (AND8rr GR8 :$src1, GR8 :$src2)>; +def : Pat<(and GR16:$src1, GR16:$src2), (AND16rr GR16:$src1, GR16:$src2)>; +def : Pat<(and GR32:$src1, GR32:$src2), (AND32rr GR32:$src1, GR32:$src2)>; -// Register-Memory And with EFLAGS result -def : Pat<(parallel (X86and_flag GR8:$src1, (loadi8 addr:$src2)), - (implicit EFLAGS)), +// and reg/mem +def : Pat<(and GR8:$src1, (loadi8 addr:$src2)), (AND8rm GR8:$src1, addr:$src2)>; -def : Pat<(parallel (X86and_flag GR16:$src1, (loadi16 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(and GR16:$src1, (loadi16 addr:$src2)), (AND16rm GR16:$src1, addr:$src2)>; -def : Pat<(parallel (X86and_flag GR32:$src1, (loadi32 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(and GR32:$src1, (loadi32 addr:$src2)), (AND32rm GR32:$src1, addr:$src2)>; -// Register-Integer And with EFLAGS result -def : Pat<(parallel (X86and_flag GR8:$src1, imm:$src2), - (implicit EFLAGS)), +// and reg/imm +def : Pat<(and GR8:$src1, imm:$src2), (AND8ri GR8:$src1, imm:$src2)>; -def : Pat<(parallel (X86and_flag GR16:$src1, imm:$src2), - (implicit EFLAGS)), +def : 
Pat<(and GR16:$src1, imm:$src2), (AND16ri GR16:$src1, imm:$src2)>; -def : Pat<(parallel (X86and_flag GR32:$src1, imm:$src2), - (implicit EFLAGS)), +def : Pat<(and GR32:$src1, imm:$src2), (AND32ri GR32:$src1, imm:$src2)>; -def : Pat<(parallel (X86and_flag GR16:$src1, i16immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(and GR16:$src1, i16immSExt8:$src2), (AND16ri8 GR16:$src1, i16immSExt8:$src2)>; -def : Pat<(parallel (X86and_flag GR32:$src1, i32immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(and GR32:$src1, i32immSExt8:$src2), (AND32ri8 GR32:$src1, i32immSExt8:$src2)>; // -disable-16bit support. diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td index e1203e2c2a37..1c81c5e981ad 100644 --- a/lib/Target/X86/X86InstrMMX.td +++ b/lib/Target/X86/X86InstrMMX.td @@ -605,22 +605,9 @@ let AddedComplexity = 10 in { def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v2i32 immAllOnesV))), VR64:$src2)), (MMX_PANDNrr VR64:$src1, VR64:$src2)>; -def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v4i16 immAllOnesV_bc))), - VR64:$src2)), - (MMX_PANDNrr VR64:$src1, VR64:$src2)>; -def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v8i8 immAllOnesV_bc))), - VR64:$src2)), - (MMX_PANDNrr VR64:$src1, VR64:$src2)>; - def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v2i32 immAllOnesV))), (load addr:$src2))), (MMX_PANDNrm VR64:$src1, addr:$src2)>; -def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v4i16 immAllOnesV_bc))), - (load addr:$src2))), - (MMX_PANDNrm VR64:$src1, addr:$src2)>; -def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v8i8 immAllOnesV_bc))), - (load addr:$src2))), - (MMX_PANDNrm VR64:$src1, addr:$src2)>; // Move MMX to lower 64-bit of XMM def : Pat<(v2i64 (scalar_to_vector (i64 (bitconvert (v8i8 VR64:$src))))), diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 720b663a6aa7..dadc2a663b57 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -69,8 +69,9 @@ def X86pcmpgtw : 
SDNode<"X86ISD::PCMPGTW", SDTIntBinOp>; def X86pcmpgtd : SDNode<"X86ISD::PCMPGTD", SDTIntBinOp>; def X86pcmpgtq : SDNode<"X86ISD::PCMPGTQ", SDTIntBinOp>; -def SDTX86CmpPTest : SDTypeProfile<0, 2, [SDTCisVT<0, v4f32>, - SDTCisVT<1, v4f32>]>; +def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, + SDTCisVT<1, v4f32>, + SDTCisVT<2, v4f32>]>; def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>; //===----------------------------------------------------------------------===// @@ -1114,15 +1115,19 @@ def STMXCSR : PSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), // load of an all-zeros value if folding it would be beneficial. // FIXME: Change encoding to pseudo! let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, - isCodeGenOnly = 1 in -def V_SET0 : PSI<0x57, MRMInitReg, (outs VR128:$dst), (ins), "", + isCodeGenOnly = 1 in { +def V_SET0PS : PSI<0x57, MRMInitReg, (outs VR128:$dst), (ins), "", + [(set VR128:$dst, (v4f32 immAllZerosV))]>; +def V_SET0PD : PDI<0x57, MRMInitReg, (outs VR128:$dst), (ins), "", + [(set VR128:$dst, (v2f64 immAllZerosV))]>; +let ExeDomain = SSEPackedInt in +def V_SET0PI : PDI<0xEF, MRMInitReg, (outs VR128:$dst), (ins), "", [(set VR128:$dst, (v4i32 immAllZerosV))]>; +} -def : Pat<(v2i64 immAllZerosV), (V_SET0)>; -def : Pat<(v8i16 immAllZerosV), (V_SET0)>; -def : Pat<(v16i8 immAllZerosV), (V_SET0)>; -def : Pat<(v2f64 immAllZerosV), (V_SET0)>; -def : Pat<(v4f32 immAllZerosV), (V_SET0)>; +def : Pat<(v2i64 immAllZerosV), (V_SET0PI)>; +def : Pat<(v8i16 immAllZerosV), (V_SET0PI)>; +def : Pat<(v16i8 immAllZerosV), (V_SET0PI)>; def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), x86_subreg_ss))>; @@ -1935,6 +1940,7 @@ let Constraints = "$src1 = $dst" in { //===---------------------------------------------------------------------===// // SSE integer instructions +let ExeDomain = SSEPackedInt in { // Move Instructions let neverHasSideEffects = 1 in @@ -2043,6 +2049,7 @@ multiclass 
PDI_binop_rm_v2i64 opc, string OpcodeStr, SDNode OpNode, } } // Constraints = "$src1 = $dst" +} // ExeDomain = SSEPackedInt // 128-bit Integer Arithmetic @@ -2105,7 +2112,8 @@ defm PSRAD : PDI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad", int_x86_sse2_psra_d, int_x86_sse2_psrai_d>; // 128-bit logical shifts. -let Constraints = "$src1 = $dst", neverHasSideEffects = 1 in { +let Constraints = "$src1 = $dst", neverHasSideEffects = 1, + ExeDomain = SSEPackedInt in { def PSLLDQri : PDIi8<0x73, MRM7r, (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), "pslldq\t{$src2, $dst|$dst, $src2}", []>; @@ -2139,7 +2147,7 @@ defm PAND : PDI_binop_rm_v2i64<0xDB, "pand", and, 1>; defm POR : PDI_binop_rm_v2i64<0xEB, "por" , or , 1>; defm PXOR : PDI_binop_rm_v2i64<0xEF, "pxor", xor, 1>; -let Constraints = "$src1 = $dst" in { +let Constraints = "$src1 = $dst", ExeDomain = SSEPackedInt in { def PANDNrr : PDI<0xDF, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), "pandn\t{$src2, $dst|$dst, $src2}", @@ -2193,6 +2201,8 @@ defm PACKSSWB : PDI_binop_rm_int<0x63, "packsswb", int_x86_sse2_packsswb_128>; defm PACKSSDW : PDI_binop_rm_int<0x6B, "packssdw", int_x86_sse2_packssdw_128>; defm PACKUSWB : PDI_binop_rm_int<0x67, "packuswb", int_x86_sse2_packuswb_128>; +let ExeDomain = SSEPackedInt in { + // Shuffle and unpack instructions let AddedComplexity = 5 in { def PSHUFDri : PDIi8<0x70, MRMSrcReg, @@ -2369,10 +2379,13 @@ def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), "maskmovdqu\t{$mask, $src|$src, $mask}", [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>; +} // ExeDomain = SSEPackedInt + // Non-temporal stores def MOVNTPDmr_Int : PDI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), "movntpd\t{$src, $dst|$dst, $src}", [(int_x86_sse2_movnt_pd addr:$dst, VR128:$src)]>; +let ExeDomain = SSEPackedInt in def MOVNTDQmr_Int : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movntdq\t{$src, $dst|$dst, $src}", 
[(int_x86_sse2_movnt_dq addr:$dst, VR128:$src)]>; @@ -2386,6 +2399,7 @@ def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movntpd\t{$src, $dst|$dst, $src}", [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)]>; +let ExeDomain = SSEPackedInt in def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movntdq\t{$src, $dst|$dst, $src}", [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>; @@ -2414,7 +2428,7 @@ def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), // We set canFoldAsLoad because this can be converted to a constant-pool // load of an all-ones value if folding it would be beneficial. let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, - isCodeGenOnly = 1 in + isCodeGenOnly = 1, ExeDomain = SSEPackedInt in // FIXME: Change encoding to pseudo. def V_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "", [(set VR128:$dst, (v4i32 immAllOnesV))]>; @@ -3016,14 +3030,14 @@ let Predicates = [HasSSE2] in { let AddedComplexity = 15 in { // Zeroing a VR128 then do a MOVS{S|D} to the lower bits. def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))), - (MOVSDrr (v2f64 (V_SET0)), FR64:$src)>; + (MOVSDrr (v2f64 (V_SET0PS)), FR64:$src)>; def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))), - (MOVSSrr (v4f32 (V_SET0)), FR32:$src)>; + (MOVSSrr (v4f32 (V_SET0PS)), FR32:$src)>; def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), - (MOVSSrr (v4f32 (V_SET0)), + (MOVSSrr (v4f32 (V_SET0PS)), (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), x86_subreg_ss)))>; def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), - (MOVSSrr (v4i32 (V_SET0)), + (MOVSSrr (v4i32 (V_SET0PI)), (EXTRACT_SUBREG (v4i32 VR128:$src), x86_subreg_ss))>; } @@ -3181,9 +3195,6 @@ def : Pat<(v4f32 (movlp:$src3 VR128:$src1, (v4f32 VR128:$src2))), (SHUFFLE_get_shuf_imm VR128:$src3))>; // Set lowest element and zero upper elements. 
-let AddedComplexity = 15 in -def : Pat<(v2f64 (movl immAllZerosV_bc, VR128:$src)), - (MOVZPQILo2PQIrr VR128:$src)>, Requires<[HasSSE2]>; def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), (MOVZPQILo2PQIrr VR128:$src)>, Requires<[HasSSE2]>; @@ -3441,8 +3452,28 @@ let Constraints = "$src1 = $dst" in { OpSize; } } -defm PMULLD : SS41I_binop_patint<0x40, "pmulld", v4i32, mul, - int_x86_sse41_pmulld, 1>; + +/// SS48I_binop_rm - Simple SSE41 binary operator. +let Constraints = "$src1 = $dst" in { +multiclass SS48I_binop_rm opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT, bit Commutable = 0> { + def rr : SS48I, + OpSize { + let isCommutable = Commutable; + } + def rm : SS48I, + OpSize; +} +} + +defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, 1>; /// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate let Constraints = "$src1 = $dst" in { @@ -3772,12 +3803,12 @@ def : Pat<(int_x86_sse41_insertps VR128:$src1, VR128:$src2, imm:$src3), let Defs = [EFLAGS] in { def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), "ptest \t{$src2, $src1|$src1, $src2}", - [(X86ptest VR128:$src1, VR128:$src2), - (implicit EFLAGS)]>, OpSize; + [(set EFLAGS, (X86ptest VR128:$src1, VR128:$src2))]>, + OpSize; def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, i128mem:$src2), "ptest \t{$src2, $src1|$src1, $src2}", - [(X86ptest VR128:$src1, (load addr:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set EFLAGS, (X86ptest VR128:$src1, (load addr:$src2)))]>, + OpSize; } def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), @@ -3817,6 +3848,53 @@ def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, VR128:$src2)), def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, (memop addr:$src2))), (PCMPGTQrm VR128:$src1, addr:$src2)>; +// TODO: These should be AES as a feature set. 
+defm AESIMC : SS42I_binop_rm_int<0xDB, "aesimc", + int_x86_aesni_aesimc>; +defm AESENC : SS42I_binop_rm_int<0xDC, "aesenc", + int_x86_aesni_aesenc>; +defm AESENCLAST : SS42I_binop_rm_int<0xDD, "aesenclast", + int_x86_aesni_aesenclast>; +defm AESDEC : SS42I_binop_rm_int<0xDE, "aesdec", + int_x86_aesni_aesdec>; +defm AESDECLAST : SS42I_binop_rm_int<0xDF, "aesdeclast", + int_x86_aesni_aesdeclast>; + +def : Pat<(v2i64 (int_x86_aesni_aesimc VR128:$src1, VR128:$src2)), + (AESIMCrr VR128:$src1, VR128:$src2)>; +def : Pat<(v2i64 (int_x86_aesni_aesimc VR128:$src1, (memop addr:$src2))), + (AESIMCrm VR128:$src1, addr:$src2)>; +def : Pat<(v2i64 (int_x86_aesni_aesenc VR128:$src1, VR128:$src2)), + (AESENCrr VR128:$src1, VR128:$src2)>; +def : Pat<(v2i64 (int_x86_aesni_aesenc VR128:$src1, (memop addr:$src2))), + (AESENCrm VR128:$src1, addr:$src2)>; +def : Pat<(v2i64 (int_x86_aesni_aesenclast VR128:$src1, VR128:$src2)), + (AESENCLASTrr VR128:$src1, VR128:$src2)>; +def : Pat<(v2i64 (int_x86_aesni_aesenclast VR128:$src1, (memop addr:$src2))), + (AESENCLASTrm VR128:$src1, addr:$src2)>; +def : Pat<(v2i64 (int_x86_aesni_aesdec VR128:$src1, VR128:$src2)), + (AESDECrr VR128:$src1, VR128:$src2)>; +def : Pat<(v2i64 (int_x86_aesni_aesdec VR128:$src1, (memop addr:$src2))), + (AESDECrm VR128:$src1, addr:$src2)>; +def : Pat<(v2i64 (int_x86_aesni_aesdeclast VR128:$src1, VR128:$src2)), + (AESDECLASTrr VR128:$src1, VR128:$src2)>; +def : Pat<(v2i64 (int_x86_aesni_aesdeclast VR128:$src1, (memop addr:$src2))), + (AESDECLASTrm VR128:$src1, addr:$src2)>; + +def AESKEYGENASSIST128rr : SS42AI<0xDF, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, i32i8imm:$src2), + "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>, + OpSize; +def AESKEYGENASSIST128rm : SS42AI<0xDF, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src1, i32i8imm:$src2), + "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + 
(int_x86_aesni_aeskeygenassist (bitconvert (memopv2i64 addr:$src1)), + imm:$src2))]>, + OpSize; + // crc intrinsic instruction // This set of instructions are only rm, the only difference is the size // of r and m. diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index cd568164b251..8a0cde49aea6 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -266,6 +266,9 @@ void X86Subtarget::AutoDetectSubtargetFeatures() { unsigned Model = 0; DetectFamilyModel(EAX, Family, Model); IsBTMemSlow = IsAMD || (Family == 6 && Model >= 13); + // If it's Nehalem, unaligned memory access is fast. + if (Family == 15 && Model == 26) + IsUAMemFast = true; GetCpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX); HasX86_64 = (EDX >> 29) & 0x1; @@ -286,6 +289,7 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &FS, , HasFMA3(false) , HasFMA4(false) , IsBTMemSlow(false) + , IsUAMemFast(false) , HasVectorUAMem(false) , DarwinVers(0) , stackAlignment(8) diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index 56220db3b215..bf30154625bf 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -78,6 +78,9 @@ class X86Subtarget : public TargetSubtarget { /// IsBTMemSlow - True if BT (bit test) of memory instructions are slow. bool IsBTMemSlow; + /// IsUAMemFast - True if unaligned memory access is fast. + bool IsUAMemFast; + /// HasVectorUAMem - True if SIMD operations can have unaligned memory /// operands. This may require setting a feature bit in the /// processor. 
@@ -148,6 +151,7 @@ class X86Subtarget : public TargetSubtarget { bool hasFMA3() const { return HasFMA3; } bool hasFMA4() const { return HasFMA4; } bool isBTMemSlow() const { return IsBTMemSlow; } + bool isUnalignedMemAccessFast() const { return IsUAMemFast; } bool hasVectorUAMem() const { return HasVectorUAMem; } bool isTargetDarwin() const { return TargetType == isDarwin; } diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index f13e6f352564..c608e56c8fbf 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -17,6 +17,7 @@ #include "llvm/PassManager.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Target/TargetRegistry.h" @@ -169,6 +170,15 @@ bool X86TargetMachine::addPostRegAlloc(PassManagerBase &PM, return true; // -print-machineinstr should print after this. 
} +bool X86TargetMachine::addPreEmitPass(PassManagerBase &PM, + CodeGenOpt::Level OptLevel) { + if (OptLevel != CodeGenOpt::None && Subtarget.hasSSE2()) { + PM.add(createSSEDomainFixPass()); + return true; + } + return false; +} + bool X86TargetMachine::addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel, JITCodeEmitter &JCE) { diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h index 2bb54544d408..ae7b5b29af14 100644 --- a/lib/Target/X86/X86TargetMachine.h +++ b/lib/Target/X86/X86TargetMachine.h @@ -66,6 +66,7 @@ class X86TargetMachine : public LLVMTargetMachine { virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel); virtual bool addPreRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel); virtual bool addPostRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel); + virtual bool addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel); virtual bool addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel, JITCodeEmitter &JCE); }; diff --git a/lib/Target/XCore/XCoreInstrInfo.cpp b/lib/Target/XCore/XCoreInstrInfo.cpp index e5f5a6dbd38a..54df33c50647 100644 --- a/lib/Target/XCore/XCoreInstrInfo.cpp +++ b/lib/Target/XCore/XCoreInstrInfo.cpp @@ -215,7 +215,15 @@ XCoreInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, bool AllowModify) const { // If the block has no terminators, it just falls into the block after it. MachineBasicBlock::iterator I = MBB.end(); - if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) + if (I == MBB.begin()) + return false; + --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return false; + --I; + } + if (!isUnpredicatedTerminator(I)) return false; // Get the last instruction in the block. 
@@ -326,6 +334,11 @@ XCoreInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { MachineBasicBlock::iterator I = MBB.end(); if (I == MBB.begin()) return 0; --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return 0; + --I; + } if (!IsBRU(I->getOpcode()) && !IsCondBranch(I->getOpcode())) return 0; diff --git a/lib/Target/XCore/XCoreInstrInfo.td b/lib/Target/XCore/XCoreInstrInfo.td index 2e9a1e5d6e8b..dd3cbc169908 100644 --- a/lib/Target/XCore/XCoreInstrInfo.td +++ b/lib/Target/XCore/XCoreInstrInfo.td @@ -32,7 +32,7 @@ def XCoreBranchLink : SDNode<"XCoreISD::BL",SDT_XCoreBranchLink, [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag, SDNPVariadic]>; -def XCoreRetsp : SDNode<"XCoreISD::RETSP", SDTNone, +def XCoreRetsp : SDNode<"XCoreISD::RETSP", SDTBrind, [SDNPHasChain, SDNPOptInFlag]>; def SDT_XCoreBR_JT : SDTypeProfile<0, 2, diff --git a/lib/Transforms/IPO/ArgumentPromotion.cpp b/lib/Transforms/IPO/ArgumentPromotion.cpp index 7cb13671bca5..40a87e880d7b 100644 --- a/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -623,6 +623,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, SmallVector Args; while (!F->use_empty()) { CallSite CS = CallSite::get(F->use_back()); + assert(CS.getCalledFunction() == F); Instruction *Call = CS.getInstruction(); const AttrListPtr &CallPAL = CS.getAttributes(); @@ -660,7 +661,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, // Non-dead argument: insert GEPs and loads as appropriate. ScalarizeTable &ArgIndices = ScalarizedElements[I]; // Store the Value* version of the indices in here, but declare it now - // for reuse + // for reuse. 
std::vector Ops; for (ScalarizeTable::iterator SI = ArgIndices.begin(), E = ArgIndices.end(); SI != E; ++SI) { @@ -677,16 +678,20 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, Type::getInt32Ty(F->getContext()) : Type::getInt64Ty(F->getContext())); Ops.push_back(ConstantInt::get(IdxTy, *II)); - // Keep track of the type we're currently indexing + // Keep track of the type we're currently indexing. ElTy = cast(ElTy)->getTypeAtIndex(*II); } - // And create a GEP to extract those indices + // And create a GEP to extract those indices. V = GetElementPtrInst::Create(V, Ops.begin(), Ops.end(), V->getName()+".idx", Call); Ops.clear(); AA.copyValue(OrigLoad->getOperand(0), V); } - Args.push_back(new LoadInst(V, V->getName()+".val", Call)); + // Since we're replacing a load make sure we take the alignment + // of the previous load. + LoadInst *newLoad = new LoadInst(V, V->getName()+".val", Call); + newLoad->setAlignment(OrigLoad->getAlignment()); + Args.push_back(newLoad); AA.copyValue(OrigLoad, Args.back()); } } @@ -694,7 +699,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, if (ExtraArgHack) Args.push_back(Constant::getNullValue(Type::getInt32Ty(F->getContext()))); - // Push any varargs arguments on the list + // Push any varargs arguments on the list. for (; AI != CS.arg_end(); ++AI, ++ArgIndex) { Args.push_back(*AI); if (Attributes Attrs = CallPAL.getParamAttributes(ArgIndex)) diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp index f386ed78b5f5..227602d19f56 100644 --- a/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -50,7 +50,7 @@ namespace { /// argument. Used so that arguments and return values can be used /// interchangably. 
struct RetOrArg { - RetOrArg(const Function* F, unsigned Idx, bool IsArg) : F(F), Idx(Idx), + RetOrArg(const Function *F, unsigned Idx, bool IsArg) : F(F), Idx(Idx), IsArg(IsArg) {} const Function *F; unsigned Idx; @@ -72,7 +72,7 @@ namespace { } std::string getDescription() const { - return std::string((IsArg ? "Argument #" : "Return value #")) + return std::string((IsArg ? "Argument #" : "Return value #")) + utostr(Idx) + " of function " + F->getNameStr(); } }; @@ -129,11 +129,11 @@ namespace { private: Liveness MarkIfNotLive(RetOrArg Use, UseVector &MaybeLiveUses); - Liveness SurveyUse(Value::use_iterator U, UseVector &MaybeLiveUses, + Liveness SurveyUse(Value::const_use_iterator U, UseVector &MaybeLiveUses, unsigned RetValNum = 0); - Liveness SurveyUses(Value *V, UseVector &MaybeLiveUses); + Liveness SurveyUses(const Value *V, UseVector &MaybeLiveUses); - void SurveyFunction(Function &F); + void SurveyFunction(const Function &F); void MarkValue(const RetOrArg &RA, Liveness L, const UseVector &MaybeLiveUses); void MarkLive(const RetOrArg &RA); @@ -196,7 +196,7 @@ bool DAE::DeleteDeadVarargs(Function &Fn) { // Start by computing a new prototype for the function, which is the same as // the old function, but doesn't have isVarArg set. const FunctionType *FTy = Fn.getFunctionType(); - + std::vector Params(FTy->param_begin(), FTy->param_end()); FunctionType *NFTy = FunctionType::get(FTy->getReturnType(), Params, false); @@ -225,7 +225,7 @@ bool DAE::DeleteDeadVarargs(Function &Fn) { SmallVector AttributesVec; for (unsigned i = 0; PAL.getSlot(i).Index <= NumArgs; ++i) AttributesVec.push_back(PAL.getSlot(i)); - if (Attributes FnAttrs = PAL.getFnAttributes()) + if (Attributes FnAttrs = PAL.getFnAttributes()) AttributesVec.push_back(AttributeWithIndex::get(~0, FnAttrs)); PAL = AttrListPtr::get(AttributesVec.begin(), AttributesVec.end()); } @@ -280,7 +280,7 @@ bool DAE::DeleteDeadVarargs(Function &Fn) { /// for void functions and 1 for functions not returning a struct. 
It returns /// the number of struct elements for functions returning a struct. static unsigned NumRetVals(const Function *F) { - if (F->getReturnType() == Type::getVoidTy(F->getContext())) + if (F->getReturnType()->isVoidTy()) return 0; else if (const StructType *STy = dyn_cast(F->getReturnType())) return STy->getNumElements(); @@ -305,15 +305,15 @@ DAE::Liveness DAE::MarkIfNotLive(RetOrArg Use, UseVector &MaybeLiveUses) { /// SurveyUse - This looks at a single use of an argument or return value /// and determines if it should be alive or not. Adds this use to MaybeLiveUses -/// if it causes the used value to become MaybeAlive. +/// if it causes the used value to become MaybeLive. /// /// RetValNum is the return value number to use when this use is used in a /// return instruction. This is used in the recursion, you should always leave /// it at 0. -DAE::Liveness DAE::SurveyUse(Value::use_iterator U, UseVector &MaybeLiveUses, - unsigned RetValNum) { - Value *V = *U; - if (ReturnInst *RI = dyn_cast(V)) { +DAE::Liveness DAE::SurveyUse(Value::const_use_iterator U, + UseVector &MaybeLiveUses, unsigned RetValNum) { + const User *V = *U; + if (const ReturnInst *RI = dyn_cast(V)) { // The value is returned from a function. It's only live when the // function's return value is live. We use RetValNum here, for the case // that U is really a use of an insertvalue instruction that uses the @@ -322,7 +322,7 @@ DAE::Liveness DAE::SurveyUse(Value::use_iterator U, UseVector &MaybeLiveUses, // We might be live, depending on the liveness of Use. return MarkIfNotLive(Use, MaybeLiveUses); } - if (InsertValueInst *IV = dyn_cast(V)) { + if (const InsertValueInst *IV = dyn_cast(V)) { if (U.getOperandNo() != InsertValueInst::getAggregateOperandIndex() && IV->hasIndices()) // The use we are examining is inserted into an aggregate. 
Our liveness @@ -334,7 +334,7 @@ DAE::Liveness DAE::SurveyUse(Value::use_iterator U, UseVector &MaybeLiveUses, // we don't change RetValNum, but do survey all our uses. Liveness Result = MaybeLive; - for (Value::use_iterator I = IV->use_begin(), + for (Value::const_use_iterator I = IV->use_begin(), E = V->use_end(); I != E; ++I) { Result = SurveyUse(I, MaybeLiveUses, RetValNum); if (Result == Live) @@ -342,24 +342,24 @@ DAE::Liveness DAE::SurveyUse(Value::use_iterator U, UseVector &MaybeLiveUses, } return Result; } - CallSite CS = CallSite::get(V); - if (CS.getInstruction()) { - Function *F = CS.getCalledFunction(); + + if (ImmutableCallSite CS = V) { + const Function *F = CS.getCalledFunction(); if (F) { // Used in a direct call. - + // Find the argument number. We know for sure that this use is an // argument, since if it was the function argument this would be an // indirect call and the we know can't be looking at a value of the // label type (for the invoke instruction). - unsigned ArgNo = CS.getArgumentNo(U.getOperandNo()); + unsigned ArgNo = CS.getArgumentNo(U); if (ArgNo >= F->getFunctionType()->getNumParams()) // The value is passed in through a vararg! Must be live. return Live; - assert(CS.getArgument(ArgNo) - == CS.getInstruction()->getOperand(U.getOperandNo()) + assert(CS.getArgument(ArgNo) + == CS->getOperand(U.getOperandNo()) && "Argument is not where we expected it"); // Value passed to a normal call. It's only live when the corresponding @@ -378,11 +378,11 @@ DAE::Liveness DAE::SurveyUse(Value::use_iterator U, UseVector &MaybeLiveUses, /// Adds all uses that cause the result to be MaybeLive to MaybeLiveRetUses. If /// the result is Live, MaybeLiveUses might be modified but its content should /// be ignored (since it might not be complete). 
-DAE::Liveness DAE::SurveyUses(Value *V, UseVector &MaybeLiveUses) { +DAE::Liveness DAE::SurveyUses(const Value *V, UseVector &MaybeLiveUses) { // Assume it's dead (which will only hold if there are no uses at all..). Liveness Result = MaybeLive; // Check each use. - for (Value::use_iterator I = V->use_begin(), + for (Value::const_use_iterator I = V->use_begin(), E = V->use_end(); I != E; ++I) { Result = SurveyUse(I, MaybeLiveUses); if (Result == Live) @@ -399,7 +399,7 @@ DAE::Liveness DAE::SurveyUses(Value *V, UseVector &MaybeLiveUses) { // We consider arguments of non-internal functions to be intrinsically alive as // well as arguments to functions which have their "address taken". // -void DAE::SurveyFunction(Function &F) { +void DAE::SurveyFunction(const Function &F) { unsigned RetCount = NumRetVals(&F); // Assume all return values are dead typedef SmallVector RetVals; @@ -411,8 +411,8 @@ void DAE::SurveyFunction(Function &F) { // MaybeLive. Initialized to a list of RetCount empty lists. RetUses MaybeLiveRetUses(RetCount); - for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) - if (ReturnInst *RI = dyn_cast(BB->getTerminator())) + for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB) + if (const ReturnInst *RI = dyn_cast(BB->getTerminator())) if (RI->getNumOperands() != 0 && RI->getOperand(0)->getType() != F.getFunctionType()->getReturnType()) { // We don't support old style multiple return values. @@ -431,17 +431,18 @@ void DAE::SurveyFunction(Function &F) { unsigned NumLiveRetVals = 0; const Type *STy = dyn_cast(F.getReturnType()); // Loop all uses of the function. - for (Value::use_iterator I = F.use_begin(), E = F.use_end(); I != E; ++I) { + for (Value::const_use_iterator I = F.use_begin(), E = F.use_end(); + I != E; ++I) { // If the function is PASSED IN as an argument, its address has been // taken. 
- CallSite CS = CallSite::get(*I); - if (!CS.getInstruction() || !CS.isCallee(I)) { + ImmutableCallSite CS(*I); + if (!CS || !CS.isCallee(I)) { MarkLive(F); return; } // If this use is anything other than a call site, the function is alive. - Instruction *TheCall = CS.getInstruction(); + const Instruction *TheCall = CS.getInstruction(); if (!TheCall) { // Not a direct call site? MarkLive(F); return; @@ -454,9 +455,9 @@ void DAE::SurveyFunction(Function &F) { if (NumLiveRetVals != RetCount) { if (STy) { // Check all uses of the return value. - for (Value::use_iterator I = TheCall->use_begin(), + for (Value::const_use_iterator I = TheCall->use_begin(), E = TheCall->use_end(); I != E; ++I) { - ExtractValueInst *Ext = dyn_cast(*I); + const ExtractValueInst *Ext = dyn_cast(*I); if (Ext && Ext->hasIndices()) { // This use uses a part of our return value, survey the uses of // that part and store the results for this index only. @@ -493,7 +494,7 @@ void DAE::SurveyFunction(Function &F) { // Now, check all of our arguments. unsigned i = 0; UseVector MaybeLiveArgUses; - for (Function::arg_iterator AI = F.arg_begin(), + for (Function::const_arg_iterator AI = F.arg_begin(), E = F.arg_end(); AI != E; ++AI, ++i) { // See what the effect of this use is (recording any uses that cause // MaybeLive in MaybeLiveArgUses). @@ -599,12 +600,12 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { const Type *RetTy = FTy->getReturnType(); const Type *NRetTy = NULL; unsigned RetCount = NumRetVals(F); - + // -1 means unused, other numbers are the new index SmallVector NewRetIdxs(RetCount, -1); std::vector RetTypes; - if (RetTy == Type::getVoidTy(F->getContext())) { - NRetTy = Type::getVoidTy(F->getContext()); + if (RetTy->isVoidTy()) { + NRetTy = RetTy; } else { const StructType *STy = dyn_cast(RetTy); if (STy) @@ -653,10 +654,10 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { // values. Otherwise, ensure that we don't have any conflicting attributes // here. 
Currently, this should not be possible, but special handling might be // required when new return value attributes are added. - if (NRetTy == Type::getVoidTy(F->getContext())) + if (NRetTy->isVoidTy()) RAttrs &= ~Attribute::typeIncompatible(NRetTy); else - assert((RAttrs & Attribute::typeIncompatible(NRetTy)) == 0 + assert((RAttrs & Attribute::typeIncompatible(NRetTy)) == 0 && "Return attributes no longer compatible?"); if (RAttrs) @@ -686,11 +687,12 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { } } - if (FnAttrs != Attribute::None) + if (FnAttrs != Attribute::None) AttributesVec.push_back(AttributeWithIndex::get(~0, FnAttrs)); // Reconstruct the AttributesList based on the vector we constructed. - AttrListPtr NewPAL = AttrListPtr::get(AttributesVec.begin(), AttributesVec.end()); + AttrListPtr NewPAL = AttrListPtr::get(AttributesVec.begin(), + AttributesVec.end()); // Work around LLVM bug PR56: the CWriter cannot emit varargs functions which // have zero fixed arguments. @@ -705,8 +707,7 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { } // Create the new function type based on the recomputed parameters. - FunctionType *NFTy = FunctionType::get(NRetTy, Params, - FTy->isVarArg()); + FunctionType *NFTy = FunctionType::get(NRetTy, Params, FTy->isVarArg()); // No change? if (NFTy == FTy) @@ -791,7 +792,7 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { // Return type not changed? Just replace users then. Call->replaceAllUsesWith(New); New->takeName(Call); - } else if (New->getType() == Type::getVoidTy(F->getContext())) { + } else if (New->getType()->isVoidTy()) { // Our return value has uses, but they will get removed later on. // Replace by null for now. Call->replaceAllUsesWith(Constant::getNullValue(Call->getType())); @@ -805,7 +806,7 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { while (isa(IP)) ++IP; InsertPt = IP; } - + // We used to return a struct. 
Instead of doing smart stuff with all the // uses of this struct, we will just rebuild it using // extract/insertvalue chaining and let instcombine clean that up. @@ -929,11 +930,11 @@ bool DAE::runOnModule(Module &M) { DEBUG(dbgs() << "DAE - Determining liveness\n"); for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) SurveyFunction(*I); - + // Now, remove all dead arguments and return values from each function in - // turn + // turn. for (Module::iterator I = M.begin(), E = M.end(); I != E; ) { - // Increment now, because the function will probably get removed (ie + // Increment now, because the function will probably get removed (ie. // replaced by a new one). Function *F = I++; Changed |= RemoveDeadStuffFromFunction(F); diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp index d8e97a26ada8..ddff5ef8b36c 100644 --- a/lib/Transforms/IPO/GlobalOpt.cpp +++ b/lib/Transforms/IPO/GlobalOpt.cpp @@ -119,7 +119,7 @@ struct GlobalStatus { /// null/false. When the first accessing function is noticed, it is recorded. /// When a second different accessing function is noticed, /// HasMultipleAccessingFunctions is set to true. - Function *AccessingFunction; + const Function *AccessingFunction; bool HasMultipleAccessingFunctions; /// HasNonInstructionUser - Set to true if this global has a user that is not @@ -140,11 +140,11 @@ struct GlobalStatus { // by constants itself. Note that constants cannot be cyclic, so this test is // pretty easy to implement recursively. 
// -static bool SafeToDestroyConstant(Constant *C) { +static bool SafeToDestroyConstant(const Constant *C) { if (isa(C)) return false; - for (Value::use_iterator UI = C->use_begin(), E = C->use_end(); UI != E; ++UI) - if (Constant *CU = dyn_cast(*UI)) { + for (Value::const_use_iterator UI = C->use_begin(), E = C->use_end(); UI != E; ++UI) + if (const Constant *CU = dyn_cast(*UI)) { if (!SafeToDestroyConstant(CU)) return false; } else return false; @@ -156,26 +156,26 @@ static bool SafeToDestroyConstant(Constant *C) { /// structure. If the global has its address taken, return true to indicate we /// can't do anything with it. /// -static bool AnalyzeGlobal(Value *V, GlobalStatus &GS, - SmallPtrSet &PHIUsers) { - for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E; ++UI) - if (ConstantExpr *CE = dyn_cast(*UI)) { +static bool AnalyzeGlobal(const Value *V, GlobalStatus &GS, + SmallPtrSet &PHIUsers) { + for (Value::const_use_iterator UI = V->use_begin(), E = V->use_end(); UI != E; ++UI) + if (const ConstantExpr *CE = dyn_cast(*UI)) { GS.HasNonInstructionUser = true; if (AnalyzeGlobal(CE, GS, PHIUsers)) return true; - } else if (Instruction *I = dyn_cast(*UI)) { + } else if (const Instruction *I = dyn_cast(*UI)) { if (!GS.HasMultipleAccessingFunctions) { - Function *F = I->getParent()->getParent(); + const Function *F = I->getParent()->getParent(); if (GS.AccessingFunction == 0) GS.AccessingFunction = F; else if (GS.AccessingFunction != F) GS.HasMultipleAccessingFunctions = true; } - if (LoadInst *LI = dyn_cast(I)) { + if (const LoadInst *LI = dyn_cast(I)) { GS.isLoaded = true; if (LI->isVolatile()) return true; // Don't hack on volatile loads. - } else if (StoreInst *SI = dyn_cast(I)) { + } else if (const StoreInst *SI = dyn_cast(I)) { // Don't allow a store OF the address, only stores TO the address. 
if (SI->getOperand(0) == V) return true; @@ -185,14 +185,13 @@ static bool AnalyzeGlobal(Value *V, GlobalStatus &GS, // value, not an aggregate), keep more specific information about // stores. if (GS.StoredType != GlobalStatus::isStored) { - if (GlobalVariable *GV = dyn_cast(SI->getOperand(1))){ + if (const GlobalVariable *GV = dyn_cast(SI->getOperand(1))){ Value *StoredVal = SI->getOperand(0); if (StoredVal == GV->getInitializer()) { if (GS.StoredType < GlobalStatus::isInitializerStored) GS.StoredType = GlobalStatus::isInitializerStored; } else if (isa(StoredVal) && cast(StoredVal)->getOperand(0) == GV) { - // G = G if (GS.StoredType < GlobalStatus::isInitializerStored) GS.StoredType = GlobalStatus::isInitializerStored; } else if (GS.StoredType < GlobalStatus::isStoredOnce) { @@ -212,7 +211,7 @@ static bool AnalyzeGlobal(Value *V, GlobalStatus &GS, if (AnalyzeGlobal(I, GS, PHIUsers)) return true; } else if (isa(I)) { if (AnalyzeGlobal(I, GS, PHIUsers)) return true; - } else if (PHINode *PN = dyn_cast(I)) { + } else if (const PHINode *PN = dyn_cast(I)) { // PHI nodes we can check just like select or GEP instructions, but we // have to be careful about infinite recursion. if (PHIUsers.insert(PN)) // Not already visited. @@ -230,7 +229,7 @@ static bool AnalyzeGlobal(Value *V, GlobalStatus &GS, } else { return true; // Any other non-load instruction might take address! } - } else if (Constant *C = dyn_cast(*UI)) { + } else if (const Constant *C = dyn_cast(*UI)) { GS.HasNonInstructionUser = true; // We might have a dead and dangling constant hanging off of here. if (!SafeToDestroyConstant(C)) @@ -1029,23 +1028,23 @@ static void ReplaceUsesOfMallocWithGlobal(Instruction *Alloc, /// LoadUsesSimpleEnoughForHeapSRA - Verify that all uses of V (a load, or a phi /// of a load) are simple enough to perform heap SRA on. This permits GEP's /// that index through the array and struct field, icmps of null, and PHIs. 
-static bool LoadUsesSimpleEnoughForHeapSRA(Value *V, - SmallPtrSet &LoadUsingPHIs, - SmallPtrSet &LoadUsingPHIsPerLoad) { +static bool LoadUsesSimpleEnoughForHeapSRA(const Value *V, + SmallPtrSet &LoadUsingPHIs, + SmallPtrSet &LoadUsingPHIsPerLoad) { // We permit two users of the load: setcc comparing against the null // pointer, and a getelementptr of a specific form. - for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;++UI){ - Instruction *User = cast(*UI); + for (Value::const_use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;++UI){ + const Instruction *User = cast(*UI); // Comparison against null is ok. - if (ICmpInst *ICI = dyn_cast(User)) { + if (const ICmpInst *ICI = dyn_cast(User)) { if (!isa(ICI->getOperand(1))) return false; continue; } // getelementptr is also ok, but only a simple form. - if (GetElementPtrInst *GEPI = dyn_cast(User)) { + if (const GetElementPtrInst *GEPI = dyn_cast(User)) { // Must index into the array and into the struct. if (GEPI->getNumOperands() < 3) return false; @@ -1054,7 +1053,7 @@ static bool LoadUsesSimpleEnoughForHeapSRA(Value *V, continue; } - if (PHINode *PN = dyn_cast(User)) { + if (const PHINode *PN = dyn_cast(User)) { if (!LoadUsingPHIsPerLoad.insert(PN)) // This means some phi nodes are dependent on each other. // Avoid infinite looping! @@ -1081,13 +1080,13 @@ static bool LoadUsesSimpleEnoughForHeapSRA(Value *V, /// AllGlobalLoadUsesSimpleEnoughForHeapSRA - If all users of values loaded from /// GV are simple enough to perform HeapSRA, return true. 
-static bool AllGlobalLoadUsesSimpleEnoughForHeapSRA(GlobalVariable *GV, +static bool AllGlobalLoadUsesSimpleEnoughForHeapSRA(const GlobalVariable *GV, Instruction *StoredVal) { - SmallPtrSet LoadUsingPHIs; - SmallPtrSet LoadUsingPHIsPerLoad; - for (Value::use_iterator UI = GV->use_begin(), E = GV->use_end(); UI != E; + SmallPtrSet LoadUsingPHIs; + SmallPtrSet LoadUsingPHIsPerLoad; + for (Value::const_use_iterator UI = GV->use_begin(), E = GV->use_end(); UI != E; ++UI) - if (LoadInst *LI = dyn_cast(*UI)) { + if (const LoadInst *LI = dyn_cast(*UI)) { if (!LoadUsesSimpleEnoughForHeapSRA(LI, LoadUsingPHIs, LoadUsingPHIsPerLoad)) return false; @@ -1099,16 +1098,16 @@ static bool AllGlobalLoadUsesSimpleEnoughForHeapSRA(GlobalVariable *GV, // that all inputs the to the PHI nodes are in the same equivalence sets. // Check to verify that all operands of the PHIs are either PHIS that can be // transformed, loads from GV, or MI itself. - for (SmallPtrSet::iterator I = LoadUsingPHIs.begin(), + for (SmallPtrSet::const_iterator I = LoadUsingPHIs.begin(), E = LoadUsingPHIs.end(); I != E; ++I) { - PHINode *PN = *I; + const PHINode *PN = *I; for (unsigned op = 0, e = PN->getNumIncomingValues(); op != e; ++op) { Value *InVal = PN->getIncomingValue(op); // PHI of the stored value itself is ok. if (InVal == StoredVal) continue; - if (PHINode *InPN = dyn_cast(InVal)) { + if (const PHINode *InPN = dyn_cast(InVal)) { // One of the PHIs in our set is (optimistically) ok. if (LoadUsingPHIs.count(InPN)) continue; @@ -1116,7 +1115,7 @@ static bool AllGlobalLoadUsesSimpleEnoughForHeapSRA(GlobalVariable *GV, } // Load from GV is ok. - if (LoadInst *LI = dyn_cast(InVal)) + if (const LoadInst *LI = dyn_cast(InVal)) if (LI->getOperand(0) == GV) continue; @@ -1664,7 +1663,7 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) { /// it if possible. If we make a change, return true. 
bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, Module::global_iterator &GVI) { - SmallPtrSet PHIUsers; + SmallPtrSet PHIUsers; GlobalStatus GS; GV->removeDeadConstantUsers(); @@ -1715,12 +1714,13 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, GS.AccessingFunction->hasExternalLinkage() && GV->getType()->getAddressSpace() == 0) { DEBUG(dbgs() << "LOCALIZING GLOBAL: " << *GV); - Instruction* FirstI = GS.AccessingFunction->getEntryBlock().begin(); + Instruction& FirstI = const_cast(*GS.AccessingFunction + ->getEntryBlock().begin()); const Type* ElemTy = GV->getType()->getElementType(); // FIXME: Pass Global's alignment when globals have alignment - AllocaInst* Alloca = new AllocaInst(ElemTy, NULL, GV->getName(), FirstI); + AllocaInst* Alloca = new AllocaInst(ElemTy, NULL, GV->getName(), &FirstI); if (!isa(GV->getInitializer())) - new StoreInst(GV->getInitializer(), Alloca, FirstI); + new StoreInst(GV->getInitializer(), Alloca, &FirstI); GV->replaceAllUsesWith(Alloca); GV->eraseFromParent(); diff --git a/lib/Transforms/IPO/PruneEH.cpp b/lib/Transforms/IPO/PruneEH.cpp index 3ae771c55f02..161246bc2598 100644 --- a/lib/Transforms/IPO/PruneEH.cpp +++ b/lib/Transforms/IPO/PruneEH.cpp @@ -168,7 +168,7 @@ bool PruneEH::SimplifyFunction(Function *F) { for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { if (InvokeInst *II = dyn_cast(BB->getTerminator())) if (II->doesNotThrow()) { - SmallVector Args(II->op_begin()+3, II->op_end()); + SmallVector Args(II->op_begin(), II->op_end() - 3); // Insert a call instruction before the invoke. 
CallInst *Call = CallInst::Create(II->getCalledValue(), Args.begin(), Args.end(), "", II); diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index 65f2e15d2780..76c815da8628 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -766,7 +766,7 @@ class InstCombineFortifiedLibCalls : public SimplifyFortifiedLibCalls { return SizeCI->getZExtValue() >= GetStringLength(CI->getOperand(SizeArgOp)); if (ConstantInt *Arg = dyn_cast(CI->getOperand(SizeArgOp))) - return SizeCI->getZExtValue() <= Arg->getZExtValue(); + return SizeCI->getZExtValue() >= Arg->getZExtValue(); } return false; } diff --git a/lib/Transforms/Scalar/ABCD.cpp b/lib/Transforms/Scalar/ABCD.cpp index ea8e5c3e53f2..6135992a2f88 100644 --- a/lib/Transforms/Scalar/ABCD.cpp +++ b/lib/Transforms/Scalar/ABCD.cpp @@ -27,6 +27,7 @@ #define DEBUG_TYPE "abcd" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/OwningPtr.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Constants.h" @@ -77,10 +78,10 @@ class ABCD : public FunctionPass { class Bound { public: Bound(APInt v, bool upper) : value(v), upper_bound(upper) {} - Bound(const Bound *b, int cnst) - : value(b->value - cnst), upper_bound(b->upper_bound) {} - Bound(const Bound *b, const APInt &cnst) - : value(b->value - cnst), upper_bound(b->upper_bound) {} + Bound(const Bound &b, int cnst) + : value(b.value - cnst), upper_bound(b.upper_bound) {} + Bound(const Bound &b, const APInt &cnst) + : value(b.value - cnst), upper_bound(b.upper_bound) {} /// Test if Bound is an upper bound bool isUpperBound() const { return upper_bound; } @@ -89,15 +90,15 @@ class ABCD : public FunctionPass { int32_t getBitWidth() const { return value.getBitWidth(); } /// Creates a Bound incrementing the one received - static Bound *createIncrement(const Bound *b) { - return new Bound(b->isUpperBound() ? 
b->value+1 : b->value-1, - b->upper_bound); + static Bound createIncrement(const Bound &b) { + return Bound(b.isUpperBound() ? b.value+1 : b.value-1, + b.upper_bound); } /// Creates a Bound decrementing the one received - static Bound *createDecrement(const Bound *b) { - return new Bound(b->isUpperBound() ? b->value-1 : b->value+1, - b->upper_bound); + static Bound createDecrement(const Bound &b) { + return Bound(b.isUpperBound() ? b.value-1 : b.value+1, + b.upper_bound); } /// Test if two bounds are equal @@ -109,36 +110,31 @@ class ABCD : public FunctionPass { } /// Test if val is less than or equal to Bound b - static bool leq(APInt val, const Bound *b) { - if (!b) return false; - return b->isUpperBound() ? val.sle(b->value) : val.sge(b->value); + static bool leq(APInt val, const Bound &b) { + return b.isUpperBound() ? val.sle(b.value) : val.sge(b.value); } /// Test if Bound a is less then or equal to Bound - static bool leq(const Bound *a, const Bound *b) { - if (!a || !b) return false; - - assert(a->isUpperBound() == b->isUpperBound()); - return a->isUpperBound() ? a->value.sle(b->value) : - a->value.sge(b->value); + static bool leq(const Bound &a, const Bound &b) { + assert(a.isUpperBound() == b.isUpperBound()); + return a.isUpperBound() ? a.value.sle(b.value) : + a.value.sge(b.value); } /// Test if Bound a is less then Bound b - static bool lt(const Bound *a, const Bound *b) { - if (!a || !b) return false; - - assert(a->isUpperBound() == b->isUpperBound()); - return a->isUpperBound() ? a->value.slt(b->value) : - a->value.sgt(b->value); + static bool lt(const Bound &a, const Bound &b) { + assert(a.isUpperBound() == b.isUpperBound()); + return a.isUpperBound() ? 
a.value.slt(b.value) : + a.value.sgt(b.value); } /// Test if Bound b is greater then or equal val - static bool geq(const Bound *b, APInt val) { + static bool geq(const Bound &b, APInt val) { return leq(val, b); } /// Test if Bound a is greater then or equal Bound b - static bool geq(const Bound *a, const Bound *b) { + static bool geq(const Bound &a, const Bound &b) { return leq(b, a); } @@ -152,29 +148,36 @@ class ABCD : public FunctionPass { /// minimum true and minimum reduced results are stored class MemoizedResultChart { public: - MemoizedResultChart() - : max_false(NULL), min_true(NULL), min_reduced(NULL) {} + MemoizedResultChart() {} + MemoizedResultChart(const MemoizedResultChart &other) { + if (other.max_false) + max_false.reset(new Bound(*other.max_false)); + if (other.min_true) + min_true.reset(new Bound(*other.min_true)); + if (other.min_reduced) + min_reduced.reset(new Bound(*other.min_reduced)); + } /// Returns the max false - Bound *getFalse() const { return max_false; } + const Bound *getFalse() const { return max_false.get(); } /// Returns the min true - Bound *getTrue() const { return min_true; } + const Bound *getTrue() const { return min_true.get(); } /// Returns the min reduced - Bound *getReduced() const { return min_reduced; } + const Bound *getReduced() const { return min_reduced.get(); } /// Return the stored result for this bound - ProveResult getResult(const Bound *bound) const; + ProveResult getResult(const Bound &bound) const; /// Stores a false found - void addFalse(Bound *bound); + void addFalse(const Bound &bound); /// Stores a true found - void addTrue(Bound *bound); + void addTrue(const Bound &bound); /// Stores a Reduced found - void addReduced(Bound *bound); + void addReduced(const Bound &bound); /// Clears redundant reduced /// If a min_true is smaller than a min_reduced then the min_reduced @@ -183,13 +186,13 @@ class ABCD : public FunctionPass { void clearRedundantReduced(); void clear() { - delete max_false; - delete 
min_true; - delete min_reduced; + max_false.reset(); + min_true.reset(); + min_reduced.reset(); } private: - Bound *max_false, *min_true, *min_reduced; + OwningPtr max_false, min_true, min_reduced; }; /// This class stores the result found for a node of the graph, @@ -198,27 +201,27 @@ class ABCD : public FunctionPass { public: /// Test if there is true result stored from b to a /// that is less then the bound - bool hasTrue(Value *b, const Bound *bound) const { - Bound *trueBound = map.lookup(b).getTrue(); - return trueBound && Bound::leq(trueBound, bound); + bool hasTrue(Value *b, const Bound &bound) const { + const Bound *trueBound = map.lookup(b).getTrue(); + return trueBound && Bound::leq(*trueBound, bound); } /// Test if there is false result stored from b to a /// that is less then the bound - bool hasFalse(Value *b, const Bound *bound) const { - Bound *falseBound = map.lookup(b).getFalse(); - return falseBound && Bound::leq(falseBound, bound); + bool hasFalse(Value *b, const Bound &bound) const { + const Bound *falseBound = map.lookup(b).getFalse(); + return falseBound && Bound::leq(*falseBound, bound); } /// Test if there is reduced result stored from b to a /// that is less then the bound - bool hasReduced(Value *b, const Bound *bound) const { - Bound *reducedBound = map.lookup(b).getReduced(); - return reducedBound && Bound::leq(reducedBound, bound); + bool hasReduced(Value *b, const Bound &bound) const { + const Bound *reducedBound = map.lookup(b).getReduced(); + return reducedBound && Bound::leq(*reducedBound, bound); } /// Returns the stored bound for b - ProveResult getBoundResult(Value *b, Bound *bound) { + ProveResult getBoundResult(Value *b, const Bound &bound) { return map[b].getResult(bound); } @@ -233,7 +236,7 @@ class ABCD : public FunctionPass { } /// Stores the bound found - void updateBound(Value *b, Bound *bound, const ProveResult res); + void updateBound(Value *b, const Bound &bound, const ProveResult res); private: // Maps a nod in the 
graph with its results found. @@ -274,7 +277,7 @@ class ABCD : public FunctionPass { bool hasEdge(Value *V, bool upper) const; /// Returns all edges pointed by vertex V - SmallPtrSet getEdges(Value *V) const { + SmallVector getEdges(Value *V) const { return graph.lookup(V); } @@ -292,13 +295,7 @@ class ABCD : public FunctionPass { } private: - DenseMap > graph; - - /// Adds a Node to the graph. - DenseMap >::iterator addNode(Value *V) { - SmallPtrSet p; - return graph.insert(std::make_pair(V, p)).first; - } + DenseMap > graph; /// Prints the header of the dot file void printHeader(raw_ostream &OS, Function &F) const; @@ -315,7 +312,7 @@ class ABCD : public FunctionPass { void printVertex(raw_ostream &OS, Value *source) const; /// Prints the edge to the dot file - void printEdge(raw_ostream &OS, Value *source, Edge *edge) const; + void printEdge(raw_ostream &OS, Value *source, const Edge &edge) const; void printName(raw_ostream &OS, Value *info) const; }; @@ -428,15 +425,15 @@ class ABCD : public FunctionPass { bool demandProve(Value *a, Value *b, int c, bool upper_bound); /// Prove that distance between b and a is <= bound - ProveResult prove(Value *a, Value *b, Bound *bound, unsigned level); + ProveResult prove(Value *a, Value *b, const Bound &bound, unsigned level); /// Updates the distance value for a and b - void updateMemDistance(Value *a, Value *b, Bound *bound, unsigned level, + void updateMemDistance(Value *a, Value *b, const Bound &bound, unsigned level, meet_function meet); InequalityGraph inequality_graph; MemoizedResult mem_result; - DenseMap active; + DenseMap active; SmallPtrSet created; SmallVector phis_to_remove; }; @@ -857,7 +854,7 @@ PHINode *ABCD::findSigma(BasicBlock *BB, Instruction *I) { /// This implementation works on any kind of inequality branch. 
bool ABCD::demandProve(Value *a, Value *b, int c, bool upper_bound) { int32_t width = cast(a->getType())->getBitWidth(); - Bound *bound = new Bound(APInt(width, c), upper_bound); + Bound bound(APInt(width, c), upper_bound); mem_result.clear(); active.clear(); @@ -867,7 +864,7 @@ bool ABCD::demandProve(Value *a, Value *b, int c, bool upper_bound) { } /// Prove that distance between b and a is <= bound -ABCD::ProveResult ABCD::prove(Value *a, Value *b, Bound *bound, +ABCD::ProveResult ABCD::prove(Value *a, Value *b, const Bound &bound, unsigned level) { // if (C[b-a<=e] == True for some e <= bound // Same or stronger difference was already proven @@ -885,22 +882,22 @@ ABCD::ProveResult ABCD::prove(Value *a, Value *b, Bound *bound, return Reduced; // traversal reached the source vertex - if (a == b && Bound::geq(bound, APInt(bound->getBitWidth(), 0, true))) + if (a == b && Bound::geq(bound, APInt(bound.getBitWidth(), 0, true))) return True; // if b has no predecessor then fail - if (!inequality_graph.hasEdge(b, bound->isUpperBound())) + if (!inequality_graph.hasEdge(b, bound.isUpperBound())) return False; // a cycle was encountered if (active.count(b)) { - if (Bound::leq(active.lookup(b), bound)) + if (Bound::leq(*active.lookup(b), bound)) return Reduced; // a "harmless" cycle return False; // an amplifying cycle } - active[b] = bound; + active[b] = &bound; PHINode *PN = dyn_cast(b); // Test if a Value is a Phi. If it is a PHINode with more than 1 incoming @@ -917,23 +914,23 @@ ABCD::ProveResult ABCD::prove(Value *a, Value *b, Bound *bound, } /// Updates the distance value for a and b -void ABCD::updateMemDistance(Value *a, Value *b, Bound *bound, unsigned level, - meet_function meet) { +void ABCD::updateMemDistance(Value *a, Value *b, const Bound &bound, + unsigned level, meet_function meet) { ABCD::ProveResult res = (meet == max) ? 
False : True; - SmallPtrSet Edges = inequality_graph.getEdges(b); - SmallPtrSet::iterator begin = Edges.begin(), end = Edges.end(); + SmallVector Edges = inequality_graph.getEdges(b); + SmallVector::iterator begin = Edges.begin(), end = Edges.end(); for (; begin != end ; ++begin) { if (((res >= Reduced) && (meet == max)) || ((res == False) && (meet == min))) { break; } - Edge *in = *begin; - if (in->isUpperBound() == bound->isUpperBound()) { - Value *succ = in->getVertex(); - res = meet(res, prove(a, succ, new Bound(bound, in->getValue()), - level+1)); + const Edge &in = *begin; + if (in.isUpperBound() == bound.isUpperBound()) { + Value *succ = in.getVertex(); + res = meet(res, prove(a, succ, Bound(bound, in.getValue()), + level+1)); } } @@ -941,53 +938,53 @@ void ABCD::updateMemDistance(Value *a, Value *b, Bound *bound, unsigned level, } /// Return the stored result for this bound -ABCD::ProveResult ABCD::MemoizedResultChart::getResult(const Bound *bound)const{ - if (max_false && Bound::leq(bound, max_false)) +ABCD::ProveResult ABCD::MemoizedResultChart::getResult(const Bound &bound)const{ + if (max_false && Bound::leq(bound, *max_false)) return False; - if (min_true && Bound::leq(min_true, bound)) + if (min_true && Bound::leq(*min_true, bound)) return True; - if (min_reduced && Bound::leq(min_reduced, bound)) + if (min_reduced && Bound::leq(*min_reduced, bound)) return Reduced; return False; } /// Stores a false found -void ABCD::MemoizedResultChart::addFalse(Bound *bound) { - if (!max_false || Bound::leq(max_false, bound)) - max_false = bound; +void ABCD::MemoizedResultChart::addFalse(const Bound &bound) { + if (!max_false || Bound::leq(*max_false, bound)) + max_false.reset(new Bound(bound)); - if (Bound::eq(max_false, min_reduced)) - min_reduced = Bound::createIncrement(min_reduced); - if (Bound::eq(max_false, min_true)) - min_true = Bound::createIncrement(min_true); - if (Bound::eq(min_reduced, min_true)) - min_reduced = NULL; + if (Bound::eq(max_false.get(), 
min_reduced.get())) + min_reduced.reset(new Bound(Bound::createIncrement(*min_reduced))); + if (Bound::eq(max_false.get(), min_true.get())) + min_true.reset(new Bound(Bound::createIncrement(*min_true))); + if (Bound::eq(min_reduced.get(), min_true.get())) + min_reduced.reset(); clearRedundantReduced(); } /// Stores a true found -void ABCD::MemoizedResultChart::addTrue(Bound *bound) { - if (!min_true || Bound::leq(bound, min_true)) - min_true = bound; +void ABCD::MemoizedResultChart::addTrue(const Bound &bound) { + if (!min_true || Bound::leq(bound, *min_true)) + min_true.reset(new Bound(bound)); - if (Bound::eq(min_true, min_reduced)) - min_reduced = Bound::createDecrement(min_reduced); - if (Bound::eq(min_true, max_false)) - max_false = Bound::createDecrement(max_false); - if (Bound::eq(max_false, min_reduced)) - min_reduced = NULL; + if (Bound::eq(min_true.get(), min_reduced.get())) + min_reduced.reset(new Bound(Bound::createDecrement(*min_reduced))); + if (Bound::eq(min_true.get(), max_false.get())) + max_false.reset(new Bound(Bound::createDecrement(*max_false))); + if (Bound::eq(max_false.get(), min_reduced.get())) + min_reduced.reset(); clearRedundantReduced(); } /// Stores a Reduced found -void ABCD::MemoizedResultChart::addReduced(Bound *bound) { - if (!min_reduced || Bound::leq(bound, min_reduced)) - min_reduced = bound; +void ABCD::MemoizedResultChart::addReduced(const Bound &bound) { + if (!min_reduced || Bound::leq(bound, *min_reduced)) + min_reduced.reset(new Bound(bound)); - if (Bound::eq(min_reduced, min_true)) - min_true = Bound::createIncrement(min_true); - if (Bound::eq(min_reduced, max_false)) - max_false = Bound::createDecrement(max_false); + if (Bound::eq(min_reduced.get(), min_true.get())) + min_true.reset(new Bound(Bound::createIncrement(*min_true))); + if (Bound::eq(min_reduced.get(), max_false.get())) + max_false.reset(new Bound(Bound::createDecrement(*max_false))); } /// Clears redundant reduced @@ -995,14 +992,14 @@ void 
ABCD::MemoizedResultChart::addReduced(Bound *bound) { /// is unnecessary and then removed. It also works for min_reduced /// begin smaller than max_false. void ABCD::MemoizedResultChart::clearRedundantReduced() { - if (min_true && min_reduced && Bound::lt(min_true, min_reduced)) - min_reduced = NULL; - if (max_false && min_reduced && Bound::lt(min_reduced, max_false)) - min_reduced = NULL; + if (min_true && min_reduced && Bound::lt(*min_true, *min_reduced)) + min_reduced.reset(); + if (max_false && min_reduced && Bound::lt(*min_reduced, *max_false)) + min_reduced.reset(); } /// Stores the bound found -void ABCD::MemoizedResult::updateBound(Value *b, Bound *bound, +void ABCD::MemoizedResult::updateBound(Value *b, const Bound &bound, const ProveResult res) { if (res == False) { map[b].addFalse(bound); @@ -1020,19 +1017,17 @@ void ABCD::InequalityGraph::addEdge(Value *V_to, Value *V_from, assert(cast(V_from->getType())->getBitWidth() == value.getBitWidth()); - DenseMap >::iterator from; - from = addNode(V_from); - from->second.insert(new Edge(V_to, value, upper)); + graph[V_from].push_back(Edge(V_to, value, upper)); } /// Test if there is any edge from V in the upper direction bool ABCD::InequalityGraph::hasEdge(Value *V, bool upper) const { - SmallPtrSet it = graph.lookup(V); + SmallVector it = graph.lookup(V); - SmallPtrSet::iterator begin = it.begin(); - SmallPtrSet::iterator end = it.end(); + SmallVector::iterator begin = it.begin(); + SmallVector::iterator end = it.end(); for (; begin != end; ++begin) { - if ((*begin)->isUpperBound() == upper) { + if (begin->isUpperBound() == upper) { return true; } } @@ -1049,18 +1044,18 @@ void ABCD::InequalityGraph::printHeader(raw_ostream &OS, Function &F) const { /// Prints the body of the dot file void ABCD::InequalityGraph::printBody(raw_ostream &OS) const { - DenseMap >::const_iterator begin = + DenseMap >::const_iterator begin = graph.begin(), end = graph.end(); for (; begin != end ; ++begin) { - SmallPtrSet::iterator 
begin_par = + SmallVector::const_iterator begin_par = begin->second.begin(), end_par = begin->second.end(); Value *source = begin->first; printVertex(OS, source); for (; begin_par != end_par ; ++begin_par) { - Edge *edge = *begin_par; + const Edge &edge = *begin_par; printEdge(OS, source, edge); } } @@ -1079,10 +1074,10 @@ void ABCD::InequalityGraph::printVertex(raw_ostream &OS, Value *source) const { /// Prints the edge to the dot file void ABCD::InequalityGraph::printEdge(raw_ostream &OS, Value *source, - Edge *edge) const { - Value *dest = edge->getVertex(); - APInt value = edge->getValue(); - bool upper = edge->isUpperBound(); + const Edge &edge) const { + Value *dest = edge.getVertex(); + APInt value = edge.getValue(); + bool upper = edge.isUpperBound(); OS << "\""; printName(OS, source); diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp index 50c96304db09..93e9bfbe35ac 100644 --- a/lib/Transforms/Scalar/CodeGenPrepare.cpp +++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp @@ -174,7 +174,7 @@ bool CodeGenPrepare::CanMergeBlocks(const BasicBlock *BB, // don't mess around with them. BasicBlock::const_iterator BBI = BB->begin(); while (const PHINode *PN = dyn_cast(BBI++)) { - for (Value::use_const_iterator UI = PN->use_begin(), E = PN->use_end(); + for (Value::const_use_iterator UI = PN->use_begin(), E = PN->use_end(); UI != E; ++UI) { const Instruction *User = cast(*UI); if (User->getParent() != DestBB || !isa(User)) @@ -714,8 +714,12 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, MemoryInst->replaceUsesOfWith(Addr, SunkAddr); - if (Addr->use_empty()) + if (Addr->use_empty()) { RecursivelyDeleteTriviallyDeadInstructions(Addr); + // This address is now available for reassignment, so erase the table entry; + // we don't want to match some completely different instruction. 
+ SunkAddrs[Addr] = 0; + } return true; } diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index fcb802a72ae0..642d59da044f 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -1004,18 +1004,18 @@ static int AnalyzeLoadFromClobberingWrite(const Type *LoadTy, Value *LoadPtr, // If the load and store are to the exact same address, they should have been // a must alias. AA must have gotten confused. - // FIXME: Study to see if/when this happens. - if (LoadOffset == StoreOffset) { + // FIXME: Study to see if/when this happens. One case is forwarding a memset + // to a load from the base of the memset. #if 0 + if (LoadOffset == StoreOffset) { dbgs() << "STORE/LOAD DEP WITH COMMON POINTER MISSED:\n" << "Base = " << *StoreBase << "\n" << "Store Ptr = " << *WritePtr << "\n" << "Store Offs = " << StoreOffset << "\n" << "Load Ptr = " << *LoadPtr << "\n"; abort(); -#endif - return -1; } +#endif // If the load and store don't overlap at all, the store doesn't provide // anything to the load. In this case, they really don't alias at all, AA @@ -1031,11 +1031,11 @@ static int AnalyzeLoadFromClobberingWrite(const Type *LoadTy, Value *LoadPtr, bool isAAFailure = false; - if (StoreOffset < LoadOffset) { + if (StoreOffset < LoadOffset) isAAFailure = StoreOffset+int64_t(StoreSize) <= LoadOffset; - } else { + else isAAFailure = LoadOffset+int64_t(LoadSize) <= StoreOffset; - } + if (isAAFailure) { #if 0 dbgs() << "STORE LOAD DEP WITH COMMON BASE:\n" diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index eb04d9401fbb..988a4cb3f2a2 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -553,22 +553,26 @@ void IndVarSimplify::SinkUnusedInvariants(Loop *L) { // New instructions were inserted at the end of the preheader. 
if (isa(I)) break; + // Don't move instructions which might have side effects, since the side - // effects need to complete before instructions inside the loop. Also - // don't move instructions which might read memory, since the loop may - // modify memory. Note that it's okay if the instruction might have - // undefined behavior: LoopSimplify guarantees that the preheader - // dominates the exit block. + // effects need to complete before instructions inside the loop. Also don't + // move instructions which might read memory, since the loop may modify + // memory. Note that it's okay if the instruction might have undefined + // behavior: LoopSimplify guarantees that the preheader dominates the exit + // block. if (I->mayHaveSideEffects() || I->mayReadFromMemory()) continue; + // Skip debug info intrinsics. if (isa(I)) continue; + // Don't sink static AllocaInsts out of the entry block, which would // turn them into dynamic allocas! if (AllocaInst *AI = dyn_cast(I)) if (AI->isStaticAlloca()) continue; + // Determine if there is a use in or before the loop (direct or // otherwise). bool UsedInLoop = false; @@ -585,19 +589,29 @@ void IndVarSimplify::SinkUnusedInvariants(Loop *L) { break; } } + // If there is, the def must remain in the preheader. if (UsedInLoop) continue; + // Otherwise, sink it to the exit block. Instruction *ToMove = I; bool Done = false; - if (I != Preheader->begin()) - --I; - else + + if (I != Preheader->begin()) { + // Skip debug info intrinsics. 
+ do { + --I; + } while (isa(I) && I != Preheader->begin()); + + if (isa(I) && I == Preheader->begin()) + Done = true; + } else { Done = true; + } + ToMove->moveBefore(InsertPt); - if (Done) - break; + if (Done) break; InsertPt = ToMove; } } diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index f920dcab25df..625a75d6cca9 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -1899,7 +1899,7 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { const Value *V = U->getValue(); if (const Instruction *Inst = dyn_cast(V)) if (L->contains(Inst)) continue; - for (Value::use_const_iterator UI = V->use_begin(), UE = V->use_end(); + for (Value::const_use_iterator UI = V->use_begin(), UE = V->use_end(); UI != UE; ++UI) { const Instruction *UserInst = dyn_cast(*UI); // Ignore non-instructions. @@ -2827,6 +2827,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, IP = Tentative; } while (isa(IP)) ++IP; + while (isa(IP)) ++IP; // Inform the Rewriter if we have a post-increment use, so that it can // perform an advantageous expansion. @@ -2864,8 +2865,10 @@ Value *LSRInstance::Expand(const LSRFixup &LF, // so that it is dominated by its operand. If the original insert point // was already dominated by the increment, keep it, because there may // be loop-variant operands that need to be respected also. 
- if (L->contains(LF.UserInst) && !DT.dominates(IVIncInsertPos, IP)) + if (L->contains(LF.UserInst) && !DT.dominates(IVIncInsertPos, IP)) { IP = IVIncInsertPos; + while (isa(IP)) ++IP; + } break; } Start = AR->getStart(); diff --git a/lib/Transforms/Scalar/Reg2Mem.cpp b/lib/Transforms/Scalar/Reg2Mem.cpp index 99e12522ce0c..7a6eec3d46b4 100644 --- a/lib/Transforms/Scalar/Reg2Mem.cpp +++ b/lib/Transforms/Scalar/Reg2Mem.cpp @@ -45,7 +45,7 @@ namespace { bool valueEscapes(const Instruction *Inst) const { const BasicBlock *BB = Inst->getParent(); - for (Value::use_const_iterator UI = Inst->use_begin(),E = Inst->use_end(); + for (Value::const_use_iterator UI = Inst->use_begin(),E = Inst->use_end(); UI != E; ++UI) if (cast(*UI)->getParent() != BB || isa(*UI)) diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index 7e37938868d6..546b7b6cc292 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -1705,28 +1705,31 @@ ModulePass *llvm::createIPSCCPPass() { } -static bool AddressIsTaken(GlobalValue *GV) { +static bool AddressIsTaken(const GlobalValue *GV) { // Delete any dead constantexpr klingons. GV->removeDeadConstantUsers(); - for (Value::use_iterator UI = GV->use_begin(), E = GV->use_end(); - UI != E; ++UI) - if (StoreInst *SI = dyn_cast(*UI)) { + for (Value::const_use_iterator UI = GV->use_begin(), E = GV->use_end(); + UI != E; ++UI) { + const User *U = *UI; + if (const StoreInst *SI = dyn_cast(U)) { if (SI->getOperand(0) == GV || SI->isVolatile()) return true; // Storing addr of GV. - } else if (isa(*UI) || isa(*UI)) { + } else if (isa(U) || isa(U)) { // Make sure we are calling the function, not passing the address. 
- if (UI.getOperandNo() != 0) + ImmutableCallSite CS(cast(U)); + if (!CS.isCallee(UI)) return true; - } else if (LoadInst *LI = dyn_cast(*UI)) { + } else if (const LoadInst *LI = dyn_cast(U)) { if (LI->isVolatile()) return true; - } else if (isa(*UI)) { + } else if (isa(U)) { // blockaddress doesn't take the address of the function, it takes addr // of label. } else { return true; } + } return false; } diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp index 738c5e8d13d1..b621e8d8aa6f 100644 --- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -79,7 +79,7 @@ static void ChangeToUnreachable(Instruction *I) { /// ChangeToCall - Convert the specified invoke into a normal call. static void ChangeToCall(InvokeInst *II) { BasicBlock *BB = II->getParent(); - SmallVector Args(II->op_begin()+3, II->op_end()); + SmallVector Args(II->op_begin(), II->op_end() - 3); CallInst *NewCall = CallInst::Create(II->getCalledValue(), Args.begin(), Args.end(), "", II); NewCall->takeName(II); diff --git a/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/lib/Transforms/Scalar/SimplifyLibCalls.cpp index 22f3628cf7c0..5941ea6571b7 100644 --- a/lib/Transforms/Scalar/SimplifyLibCalls.cpp +++ b/lib/Transforms/Scalar/SimplifyLibCalls.cpp @@ -350,10 +350,16 @@ struct StrNCmpOpt : public LibCallOptimization { // 'strcpy' Optimizations struct StrCpyOpt : public LibCallOptimization { + bool OptChkCall; // True if it's optimizing a __strcpy_chk libcall. + + StrCpyOpt(bool c) : OptChkCall(c) {} + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { // Verify the "strcpy" function prototype. + unsigned NumParams = OptChkCall ? 
3 : 2; const FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) || + if (FT->getNumParams() != NumParams || + FT->getReturnType() != FT->getParamType(0) || FT->getParamType(0) != FT->getParamType(1) || FT->getParamType(0) != Type::getInt8PtrTy(*Context)) return 0; @@ -371,8 +377,13 @@ struct StrCpyOpt : public LibCallOptimization { // We have enough information to now generate the memcpy call to do the // concatenation for us. Make a memcpy to copy the nul byte with align = 1. - EmitMemCpy(Dst, Src, - ConstantInt::get(TD->getIntPtrType(*Context), Len), 1, B, TD); + if (OptChkCall) + EmitMemCpyChk(Dst, Src, + ConstantInt::get(TD->getIntPtrType(*Context), Len), + CI->getOperand(3), B, TD); + else + EmitMemCpy(Dst, Src, + ConstantInt::get(TD->getIntPtrType(*Context), Len), 1, B, TD); return Dst; } }; @@ -1162,7 +1173,8 @@ namespace { StringMap Optimizations; // String and Memory LibCall Optimizations StrCatOpt StrCat; StrNCatOpt StrNCat; StrChrOpt StrChr; StrCmpOpt StrCmp; - StrNCmpOpt StrNCmp; StrCpyOpt StrCpy; StrNCpyOpt StrNCpy; StrLenOpt StrLen; + StrNCmpOpt StrNCmp; StrCpyOpt StrCpy; StrCpyOpt StrCpyChk; + StrNCpyOpt StrNCpy; StrLenOpt StrLen; StrToOpt StrTo; StrStrOpt StrStr; MemCmpOpt MemCmp; MemCpyOpt MemCpy; MemMoveOpt MemMove; MemSetOpt MemSet; // Math Library Optimizations @@ -1177,8 +1189,7 @@ namespace { bool Modified; // This is only used by doInitialization. public: static char ID; // Pass identification - SimplifyLibCalls() : FunctionPass(&ID) {} - + SimplifyLibCalls() : FunctionPass(&ID), StrCpy(false), StrCpyChk(true) {} void InitOptimizations(); bool runOnFunction(Function &F); @@ -1228,6 +1239,9 @@ void SimplifyLibCalls::InitOptimizations() { Optimizations["memmove"] = &MemMove; Optimizations["memset"] = &MemSet; + // _chk variants of String and Memory LibCall Optimizations. 
+ Optimizations["__strcpy_chk"] = &StrCpyChk; + // Math Library Optimizations Optimizations["powf"] = &Pow; Optimizations["pow"] = &Pow; diff --git a/lib/Transforms/Utils/AddrModeMatcher.cpp b/lib/Transforms/Utils/AddrModeMatcher.cpp index be6b3834f273..c70bab5492ef 100644 --- a/lib/Transforms/Utils/AddrModeMatcher.cpp +++ b/lib/Transforms/Utils/AddrModeMatcher.cpp @@ -440,8 +440,9 @@ static bool FindAllMemoryUses(Instruction *I, } if (StoreInst *SI = dyn_cast(*UI)) { - if (UI.getOperandNo() == 0) return true; // Storing addr, not into addr. - MemoryUses.push_back(std::make_pair(SI, UI.getOperandNo())); + unsigned opNo = UI.getOperandNo(); + if (opNo == 0) return true; // Storing addr, not into addr. + MemoryUses.push_back(std::make_pair(SI, opNo)); continue; } diff --git a/lib/Transforms/Utils/BreakCriticalEdges.cpp b/lib/Transforms/Utils/BreakCriticalEdges.cpp index 36573906224d..8c25ad139b94 100644 --- a/lib/Transforms/Utils/BreakCriticalEdges.cpp +++ b/lib/Transforms/Utils/BreakCriticalEdges.cpp @@ -94,7 +94,7 @@ bool llvm::isCriticalEdge(const TerminatorInst *TI, unsigned SuccNum, if (TI->getNumSuccessors() == 1) return false; const BasicBlock *Dest = TI->getSuccessor(SuccNum); - pred_const_iterator I = pred_begin(Dest), E = pred_end(Dest); + const_pred_iterator I = pred_begin(Dest), E = pred_end(Dest); // If there is more than one predecessor, this is a critical edge... assert(I != E && "No preds, but we have an edge to the block?"); diff --git a/lib/Transforms/Utils/BuildLibCalls.cpp b/lib/Transforms/Utils/BuildLibCalls.cpp index b44f01976044..0afccf42f637 100644 --- a/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/lib/Transforms/Utils/BuildLibCalls.cpp @@ -108,7 +108,7 @@ Value *llvm::EmitStrNCpy(Value *Dst, Value *Src, Value *Len, /// EmitMemCpy - Emit a call to the memcpy function to the builder. This always -/// expects that the size has type 'intptr_t' and Dst/Src are pointers. +/// expects that Len has type 'intptr_t' and Dst/Src are pointers. 
Value *llvm::EmitMemCpy(Value *Dst, Value *Src, Value *Len, unsigned Align, IRBuilder<> &B, const TargetData *TD) { Module *M = B.GetInsertBlock()->getParent()->getParent(); @@ -120,10 +120,34 @@ Value *llvm::EmitMemCpy(Value *Dst, Value *Src, Value *Len, ConstantInt::get(B.getInt32Ty(), Align)); } +/// EmitMemCpyChk - Emit a call to the __memcpy_chk function to the builder. +/// This expects that the Len and ObjSize have type 'intptr_t' and Dst/Src +/// are pointers. +Value *llvm::EmitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize, + IRBuilder<> &B, const TargetData *TD) { + Module *M = B.GetInsertBlock()->getParent()->getParent(); + AttributeWithIndex AWI; + AWI = AttributeWithIndex::get(~0u, Attribute::NoUnwind); + LLVMContext &Context = B.GetInsertBlock()->getContext(); + Value *MemCpy = M->getOrInsertFunction("__memcpy_chk", + AttrListPtr::get(&AWI, 1), + B.getInt8PtrTy(), + B.getInt8PtrTy(), + B.getInt8PtrTy(), + TD->getIntPtrType(Context), + TD->getIntPtrType(Context), NULL); + Dst = CastToCStr(Dst, B); + Src = CastToCStr(Src, B); + CallInst *CI = B.CreateCall4(MemCpy, Dst, Src, Len, ObjSize); + if (const Function *F = dyn_cast(MemCpy->stripPointerCasts())) + CI->setCallingConv(F->getCallingConv()); + return CI; +} + /// EmitMemMove - Emit a call to the memmove function to the builder. This /// always expects that the size has type 'intptr_t' and Dst/Src are pointers. 
Value *llvm::EmitMemMove(Value *Dst, Value *Src, Value *Len, - unsigned Align, IRBuilder<> &B, const TargetData *TD) { + unsigned Align, IRBuilder<> &B, const TargetData *TD) { Module *M = B.GetInsertBlock()->getParent()->getParent(); LLVMContext &Context = B.GetInsertBlock()->getContext(); const Type *Ty = TD->getIntPtrType(Context); diff --git a/lib/Transforms/Utils/LowerInvoke.cpp b/lib/Transforms/Utils/LowerInvoke.cpp index 766c4d99a8f9..bbbcc1adae74 100644 --- a/lib/Transforms/Utils/LowerInvoke.cpp +++ b/lib/Transforms/Utils/LowerInvoke.cpp @@ -226,7 +226,7 @@ bool LowerInvoke::insertCheapEHSupport(Function &F) { bool Changed = false; for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) if (InvokeInst *II = dyn_cast(BB->getTerminator())) { - std::vector CallArgs(II->op_begin()+3, II->op_end()); + std::vector CallArgs(II->op_begin(), II->op_end() - 3); // Insert a normal call instruction... CallInst *NewCall = CallInst::Create(II->getCalledValue(), CallArgs.begin(), CallArgs.end(), "",II); @@ -298,7 +298,7 @@ void LowerInvoke::rewriteExpensiveInvoke(InvokeInst *II, unsigned InvokeNo, CatchSwitch->addCase(InvokeNoC, II->getUnwindDest()); // Insert a normal call instruction. - std::vector CallArgs(II->op_begin()+3, II->op_end()); + std::vector CallArgs(II->op_begin(), II->op_end() - 3); CallInst *NewCall = CallInst::Create(II->getCalledValue(), CallArgs.begin(), CallArgs.end(), "", II); diff --git a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index 4f5a70b3b258..f181f3a0a46f 100644 --- a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -68,7 +68,7 @@ bool llvm::isAllocaPromotable(const AllocaInst *AI) { // assignments to subsections of the memory unit. // Only allow direct and non-volatile loads and stores... 
- for (Value::use_const_iterator UI = AI->use_begin(), UE = AI->use_end(); + for (Value::const_use_iterator UI = AI->use_begin(), UE = AI->use_end(); UI != UE; ++UI) // Loop over all of the uses of the alloca if (const LoadInst *LI = dyn_cast(*UI)) { if (LI->isVolatile()) diff --git a/lib/Transforms/Utils/SSAUpdater.cpp b/lib/Transforms/Utils/SSAUpdater.cpp index a31235a1f5cb..292332e4f8a6 100644 --- a/lib/Transforms/Utils/SSAUpdater.cpp +++ b/lib/Transforms/Utils/SSAUpdater.cpp @@ -14,31 +14,82 @@ #include "llvm/Transforms/Utils/SSAUpdater.h" #include "llvm/Instructions.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/Support/AlignOf.h" +#include "llvm/Support/Allocator.h" #include "llvm/Support/CFG.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; -typedef DenseMap > AvailableValsTy; -typedef std::vector > > - IncomingPredInfoTy; +/// BBInfo - Per-basic block information used internally by SSAUpdater. +/// The predecessors of each block are cached here since pred_iterator is +/// slow and we need to iterate over the blocks at least a few times. +class SSAUpdater::BBInfo { +public: + Value *AvailableVal; // Value to use in this block. + BasicBlock *DefBB; // Block that defines the available value. + unsigned NumPreds; // Number of predecessor blocks. + BasicBlock **Preds; // Array[NumPreds] of predecessor blocks. + unsigned Counter; // Marker to identify blocks already visited. + PHINode *PHITag; // Marker for existing PHIs that match. + BBInfo(BasicBlock *BB, Value *V, BumpPtrAllocator *Allocator); +}; +typedef DenseMap BBMapTy; + +SSAUpdater::BBInfo::BBInfo(BasicBlock *BB, Value *V, + BumpPtrAllocator *Allocator) + : AvailableVal(V), DefBB(0), NumPreds(0), Preds(0), Counter(0), PHITag(0) { + // If this block has a known value, don't bother finding its predecessors. 
+ if (V) { + DefBB = BB; + return; + } + + // We can get our predecessor info by walking the pred_iterator list, but it + // is relatively slow. If we already have PHI nodes in this block, walk one + // of them to get the predecessor list instead. + if (PHINode *SomePhi = dyn_cast(BB->begin())) { + NumPreds = SomePhi->getNumIncomingValues(); + Preds = static_cast + (Allocator->Allocate(NumPreds * sizeof(BasicBlock*), + AlignOf::Alignment)); + for (unsigned pi = 0; pi != NumPreds; ++pi) + Preds[pi] = SomePhi->getIncomingBlock(pi); + return; + } + + // Stash the predecessors in a temporary vector until we know how much space + // to allocate for them. + SmallVector TmpPreds; + for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { + TmpPreds.push_back(*PI); + ++NumPreds; + } + Preds = static_cast + (Allocator->Allocate(NumPreds * sizeof(BasicBlock*), + AlignOf::Alignment)); + memcpy(Preds, TmpPreds.data(), NumPreds * sizeof(BasicBlock*)); +} + +typedef DenseMap AvailableValsTy; static AvailableValsTy &getAvailableVals(void *AV) { return *static_cast(AV); } -static IncomingPredInfoTy &getIncomingPredInfo(void *IPI) { - return *static_cast(IPI); +static BBMapTy *getBBMap(void *BM) { + return static_cast(BM); } +static BumpPtrAllocator *getAllocator(void *BPA) { + return static_cast(BPA); +} SSAUpdater::SSAUpdater(SmallVectorImpl *NewPHI) - : AV(0), PrototypeValue(0), IPI(0), InsertedPHIs(NewPHI) {} + : AV(0), PrototypeValue(0), BM(0), BPA(0), InsertedPHIs(NewPHI) {} SSAUpdater::~SSAUpdater() { delete &getAvailableVals(AV); - delete &getIncomingPredInfo(IPI); } /// Initialize - Reset this object to get ready for a new set of SSA @@ -48,11 +99,6 @@ void SSAUpdater::Initialize(Value *ProtoValue) { AV = new AvailableValsTy(); else getAvailableVals(AV).clear(); - - if (IPI == 0) - IPI = new IncomingPredInfoTy(); - else - getIncomingPredInfo(IPI).clear(); PrototypeValue = ProtoValue; } @@ -73,7 +119,7 @@ void SSAUpdater::AddAvailableValue(BasicBlock *BB, 
Value *V) { /// IsEquivalentPHI - Check if PHI has the same incoming value as specified /// in ValueMapping for each predecessor block. -static bool IsEquivalentPHI(PHINode *PHI, +static bool IsEquivalentPHI(PHINode *PHI, DenseMap &ValueMapping) { unsigned PHINumValues = PHI->getNumIncomingValues(); if (PHINumValues != ValueMapping.size()) @@ -89,38 +135,12 @@ static bool IsEquivalentPHI(PHINode *PHI, return true; } -/// GetExistingPHI - Check if BB already contains a phi node that is equivalent -/// to the specified mapping from predecessor blocks to incoming values. -static Value *GetExistingPHI(BasicBlock *BB, - DenseMap &ValueMapping) { - PHINode *SomePHI; - for (BasicBlock::iterator It = BB->begin(); - (SomePHI = dyn_cast(It)); ++It) { - if (IsEquivalentPHI(SomePHI, ValueMapping)) - return SomePHI; - } - return 0; -} - -/// GetExistingPHI - Check if BB already contains an equivalent phi node. -/// The InputIt type must be an iterator over std::pair -/// objects that specify the mapping from predecessor blocks to incoming values. -template -static Value *GetExistingPHI(BasicBlock *BB, const InputIt &I, - const InputIt &E) { - // Avoid create the mapping if BB has no phi nodes at all. - if (!isa(BB->begin())) - return 0; - DenseMap ValueMapping(I, E); - return GetExistingPHI(BB, ValueMapping); -} - /// GetValueAtEndOfBlock - Construct SSA form, materializing a value that is /// live at the end of the specified block. 
Value *SSAUpdater::GetValueAtEndOfBlock(BasicBlock *BB) { - assert(getIncomingPredInfo(IPI).empty() && "Unexpected Internal State"); + assert(BM == 0 && BPA == 0 && "Unexpected Internal State"); Value *Res = GetValueAtEndOfBlockInternal(BB); - assert(getIncomingPredInfo(IPI).empty() && "Unexpected Internal State"); + assert(BM == 0 && BPA == 0 && "Unexpected Internal State"); return Res; } @@ -146,7 +166,7 @@ Value *SSAUpdater::GetValueAtEndOfBlock(BasicBlock *BB) { Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { // If there is no definition of the renamed variable in this block, just use // GetValueAtEndOfBlock to do our work. - if (!getAvailableVals(AV).count(BB)) + if (!HasValueForBlock(BB)) return GetValueAtEndOfBlock(BB); // Otherwise, we have the hard case. Get the live-in values for each @@ -193,10 +213,18 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { if (SingularValue != 0) return SingularValue; - // Otherwise, we do need a PHI. - if (Value *ExistingPHI = GetExistingPHI(BB, PredValues.begin(), - PredValues.end())) - return ExistingPHI; + // Otherwise, we do need a PHI: check to see if we already have one available + // in this block that produces the right value. + if (isa(BB->begin())) { + DenseMap ValueMapping(PredValues.begin(), + PredValues.end()); + PHINode *SomePHI; + for (BasicBlock::iterator It = BB->begin(); + (SomePHI = dyn_cast(It)); ++It) { + if (IsEquivalentPHI(SomePHI, ValueMapping)) + return SomePHI; + } + } // Ok, we have no way out, insert a new one now. PHINode *InsertedPHI = PHINode::Create(PrototypeValue->getType(), @@ -226,7 +254,7 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { /// which use their value in the corresponding predecessor. 
void SSAUpdater::RewriteUse(Use &U) { Instruction *User = cast(U.getUser()); - + Value *V; if (PHINode *UserPN = dyn_cast(User)) V = GetValueAtEndOfBlock(UserPN->getIncomingBlock(U)); @@ -236,161 +264,264 @@ void SSAUpdater::RewriteUse(Use &U) { U.set(V); } - /// GetValueAtEndOfBlockInternal - Check to see if AvailableVals has an entry /// for the specified BB and if so, return it. If not, construct SSA form by -/// walking predecessors inserting PHI nodes as needed until we get to a block -/// where the value is available. -/// +/// first calculating the required placement of PHIs and then inserting new +/// PHIs where needed. Value *SSAUpdater::GetValueAtEndOfBlockInternal(BasicBlock *BB) { AvailableValsTy &AvailableVals = getAvailableVals(AV); + if (Value *V = AvailableVals[BB]) + return V; - // Query AvailableVals by doing an insertion of null. - std::pair InsertRes = - AvailableVals.insert(std::make_pair(BB, TrackingVH())); + // Pool allocation used internally by GetValueAtEndOfBlock. + BumpPtrAllocator AllocatorObj; + BBMapTy BBMapObj; + BPA = &AllocatorObj; + BM = &BBMapObj; - // Handle the case when the insertion fails because we have already seen BB. - if (!InsertRes.second) { - // If the insertion failed, there are two cases. The first case is that the - // value is already available for the specified block. If we get this, just - // return the value. - if (InsertRes.first->second != 0) - return InsertRes.first->second; + BBInfo *Info = new (AllocatorObj) BBInfo(BB, 0, &AllocatorObj); + BBMapObj[BB] = Info; - // Otherwise, if the value we find is null, then this is the value is not - // known but it is being computed elsewhere in our recursion. This means - // that we have a cycle. Handle this by inserting a PHI node and returning - // it. When we get back to the first instance of the recursion we will fill - // in the PHI node. 
- return InsertRes.first->second = - PHINode::Create(PrototypeValue->getType(), PrototypeValue->getName(), - &BB->front()); + bool Changed; + unsigned Counter = 1; + do { + Changed = false; + FindPHIPlacement(BB, Info, Changed, Counter); + ++Counter; + } while (Changed); + + FindAvailableVal(BB, Info, Counter); + + BPA = 0; + BM = 0; + return Info->AvailableVal; +} + +/// FindPHIPlacement - Recursively visit the predecessors of a block to find +/// the reaching definition for each predecessor and then determine whether +/// a PHI is needed in this block. +void SSAUpdater::FindPHIPlacement(BasicBlock *BB, BBInfo *Info, bool &Changed, + unsigned Counter) { + AvailableValsTy &AvailableVals = getAvailableVals(AV); + BBMapTy *BBMap = getBBMap(BM); + BumpPtrAllocator *Allocator = getAllocator(BPA); + bool BBNeedsPHI = false; + BasicBlock *SamePredDefBB = 0; + + // If there are no predecessors, then we must have found an unreachable + // block. Treat it as a definition with 'undef'. + if (Info->NumPreds == 0) { + Info->AvailableVal = UndefValue::get(PrototypeValue->getType()); + Info->DefBB = BB; + return; } - // Okay, the value isn't in the map and we just inserted a null in the entry - // to indicate that we're processing the block. Since we have no idea what - // value is in this block, we have to recurse through our predecessors. - // - // While we're walking our predecessors, we keep track of them in a vector, - // then insert a PHI node in the end if we actually need one. We could use a - // smallvector here, but that would take a lot of stack space for every level - // of the recursion, just use IncomingPredInfo as an explicit stack. - IncomingPredInfoTy &IncomingPredInfo = getIncomingPredInfo(IPI); - unsigned FirstPredInfoEntry = IncomingPredInfo.size(); - - // As we're walking the predecessors, keep track of whether they are all - // producing the same value. If so, this value will capture it, if not, it - // will get reset to null. 
We distinguish the no-predecessor case explicitly - // below. - TrackingVH ExistingValue; - - // We can get our predecessor info by walking the pred_iterator list, but it - // is relatively slow. If we already have PHI nodes in this block, walk one - // of them to get the predecessor list instead. - if (PHINode *SomePhi = dyn_cast(BB->begin())) { - for (unsigned i = 0, e = SomePhi->getNumIncomingValues(); i != e; ++i) { - BasicBlock *PredBB = SomePhi->getIncomingBlock(i); - Value *PredVal = GetValueAtEndOfBlockInternal(PredBB); - IncomingPredInfo.push_back(std::make_pair(PredBB, PredVal)); - - // Set ExistingValue to singular value from all predecessors so far. - if (i == 0) - ExistingValue = PredVal; - else if (PredVal != ExistingValue) - ExistingValue = 0; + Info->Counter = Counter; + for (unsigned pi = 0; pi != Info->NumPreds; ++pi) { + BasicBlock *Pred = Info->Preds[pi]; + BBMapTy::value_type &BBMapBucket = BBMap->FindAndConstruct(Pred); + if (!BBMapBucket.second) { + Value *PredVal = AvailableVals.lookup(Pred); + BBMapBucket.second = new (*Allocator) BBInfo(Pred, PredVal, Allocator); } - } else { - bool isFirstPred = true; - for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { - BasicBlock *PredBB = *PI; - Value *PredVal = GetValueAtEndOfBlockInternal(PredBB); - IncomingPredInfo.push_back(std::make_pair(PredBB, PredVal)); + BBInfo *PredInfo = BBMapBucket.second; + BasicBlock *DefBB = 0; + if (!PredInfo->AvailableVal) { + if (PredInfo->Counter != Counter) + FindPHIPlacement(Pred, PredInfo, Changed, Counter); - // Set ExistingValue to singular value from all predecessors so far. - if (isFirstPred) { - ExistingValue = PredVal; - isFirstPred = false; - } else if (PredVal != ExistingValue) - ExistingValue = 0; + // Ignore back edges where the value is not yet known. 
+ if (!PredInfo->DefBB) + continue; + } + DefBB = PredInfo->DefBB; + + if (!SamePredDefBB) + SamePredDefBB = DefBB; + else if (DefBB != SamePredDefBB) + BBNeedsPHI = true; + } + + BasicBlock *NewDefBB = (BBNeedsPHI ? BB : SamePredDefBB); + if (Info->DefBB != NewDefBB) { + Changed = true; + Info->DefBB = NewDefBB; + } +} + +/// FindAvailableVal - If this block requires a PHI, first check if an existing +/// PHI matches the PHI placement and reaching definitions computed earlier, +/// and if not, create a new PHI. Visit all the block's predecessors to +/// calculate the available value for each one and fill in the incoming values +/// for a new PHI. +void SSAUpdater::FindAvailableVal(BasicBlock *BB, BBInfo *Info, + unsigned Counter) { + if (Info->AvailableVal || Info->Counter == Counter) + return; + + AvailableValsTy &AvailableVals = getAvailableVals(AV); + BBMapTy *BBMap = getBBMap(BM); + + // Check if there needs to be a PHI in BB. + PHINode *NewPHI = 0; + if (Info->DefBB == BB) { + // Look for an existing PHI. + FindExistingPHI(BB); + if (!Info->AvailableVal) { + NewPHI = PHINode::Create(PrototypeValue->getType(), + PrototypeValue->getName(), &BB->front()); + NewPHI->reserveOperandSpace(Info->NumPreds); + Info->AvailableVal = NewPHI; + AvailableVals[BB] = NewPHI; } } - // If there are no predecessors, then we must have found an unreachable block - // just return 'undef'. Since there are no predecessors, InsertRes must not - // be invalidated. - if (IncomingPredInfo.size() == FirstPredInfoEntry) - return InsertRes.first->second = UndefValue::get(PrototypeValue->getType()); - - /// Look up BB's entry in AvailableVals. 'InsertRes' may be invalidated. If - /// this block is involved in a loop, a no-entry PHI node will have been - /// inserted as InsertedVal. Otherwise, we'll still have the null we inserted - /// above. 
- TrackingVH &InsertedVal = AvailableVals[BB]; - - // If the predecessor values are not all the same, then check to see if there - // is an existing PHI that can be used. - if (!ExistingValue) - ExistingValue = GetExistingPHI(BB, - IncomingPredInfo.begin()+FirstPredInfoEntry, - IncomingPredInfo.end()); - - // If there is an existing value we can use, then we don't need to insert a - // PHI. This is the simple and common case. - if (ExistingValue) { - // If a PHI node got inserted, replace it with the existing value and delete - // it. - if (InsertedVal) { - PHINode *OldVal = cast(InsertedVal); - // Be careful about dead loops. These RAUW's also update InsertedVal. - if (InsertedVal != ExistingValue) - OldVal->replaceAllUsesWith(ExistingValue); - else - OldVal->replaceAllUsesWith(UndefValue::get(InsertedVal->getType())); - OldVal->eraseFromParent(); - } else { - InsertedVal = ExistingValue; - } - - // Either path through the 'if' should have set InsertedVal -> ExistingVal. - assert((InsertedVal == ExistingValue || isa(InsertedVal)) && - "RAUW didn't change InsertedVal to be ExistingValue"); - - // Drop the entries we added in IncomingPredInfo to restore the stack. - IncomingPredInfo.erase(IncomingPredInfo.begin()+FirstPredInfoEntry, - IncomingPredInfo.end()); - return ExistingValue; + // Iterate through the block's predecessors. + Info->Counter = Counter; + for (unsigned pi = 0; pi != Info->NumPreds; ++pi) { + BasicBlock *Pred = Info->Preds[pi]; + BBInfo *PredInfo = (*BBMap)[Pred]; + FindAvailableVal(Pred, PredInfo, Counter); + if (NewPHI) { + // Skip to the nearest preceding definition. + if (PredInfo->DefBB != Pred) + PredInfo = (*BBMap)[PredInfo->DefBB]; + NewPHI->addIncoming(PredInfo->AvailableVal, Pred); + } else if (!Info->AvailableVal) + Info->AvailableVal = PredInfo->AvailableVal; } - // Otherwise, we do need a PHI: insert one now if we don't already have one. 
- if (InsertedVal == 0) - InsertedVal = PHINode::Create(PrototypeValue->getType(), - PrototypeValue->getName(), &BB->front()); - - PHINode *InsertedPHI = cast(InsertedVal); - InsertedPHI->reserveOperandSpace(IncomingPredInfo.size()-FirstPredInfoEntry); - - // Fill in all the predecessors of the PHI. - for (IncomingPredInfoTy::iterator I = - IncomingPredInfo.begin()+FirstPredInfoEntry, - E = IncomingPredInfo.end(); I != E; ++I) - InsertedPHI->addIncoming(I->second, I->first); - - // Drop the entries we added in IncomingPredInfo to restore the stack. - IncomingPredInfo.erase(IncomingPredInfo.begin()+FirstPredInfoEntry, - IncomingPredInfo.end()); - - // See if the PHI node can be merged to a single value. This can happen in - // loop cases when we get a PHI of itself and one other value. - if (Value *ConstVal = InsertedPHI->hasConstantValue()) { - InsertedPHI->replaceAllUsesWith(ConstVal); - InsertedPHI->eraseFromParent(); - InsertedVal = ConstVal; - } else { - DEBUG(dbgs() << " Inserted PHI: " << *InsertedPHI << "\n"); + if (NewPHI) { + DEBUG(dbgs() << " Inserted PHI: " << *NewPHI << "\n"); // If the client wants to know about all new instructions, tell it. - if (InsertedPHIs) InsertedPHIs->push_back(InsertedPHI); + if (InsertedPHIs) InsertedPHIs->push_back(NewPHI); + } +} + +/// FindExistingPHI - Look through the PHI nodes in a block to see if any of +/// them match what is needed. +void SSAUpdater::FindExistingPHI(BasicBlock *BB) { + PHINode *SomePHI; + for (BasicBlock::iterator It = BB->begin(); + (SomePHI = dyn_cast(It)); ++It) { + if (CheckIfPHIMatches(SomePHI)) { + RecordMatchingPHI(SomePHI); + break; + } + ClearPHITags(SomePHI); + } +} + +/// CheckIfPHIMatches - Check if a PHI node matches the placement and values +/// in the BBMap. +bool SSAUpdater::CheckIfPHIMatches(PHINode *PHI) { + BBMapTy *BBMap = getBBMap(BM); + SmallVector WorkList; + WorkList.push_back(PHI); + + // Mark that the block containing this PHI has been visited. 
+ (*BBMap)[PHI->getParent()]->PHITag = PHI; + + while (!WorkList.empty()) { + PHI = WorkList.pop_back_val(); + + // Iterate through the PHI's incoming values. + for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i) { + Value *IncomingVal = PHI->getIncomingValue(i); + BasicBlock *Pred = PHI->getIncomingBlock(i); + BBInfo *PredInfo = (*BBMap)[Pred]; + // Skip to the nearest preceding definition. + if (PredInfo->DefBB != Pred) { + Pred = PredInfo->DefBB; + PredInfo = (*BBMap)[Pred]; + } + + // Check if it matches the expected value. + if (PredInfo->AvailableVal) { + if (IncomingVal == PredInfo->AvailableVal) + continue; + return false; + } + + // Check if the value is a PHI in the correct block. + PHINode *IncomingPHIVal = dyn_cast(IncomingVal); + if (!IncomingPHIVal || IncomingPHIVal->getParent() != Pred) + return false; + + // If this block has already been visited, check if this PHI matches. + if (PredInfo->PHITag) { + if (IncomingPHIVal == PredInfo->PHITag) + continue; + return false; + } + PredInfo->PHITag = IncomingPHIVal; + + WorkList.push_back(IncomingPHIVal); + } + } + return true; +} + +/// RecordMatchingPHI - For a PHI node that matches, record it and its input +/// PHIs in both the BBMap and the AvailableVals mapping. +void SSAUpdater::RecordMatchingPHI(PHINode *PHI) { + BBMapTy *BBMap = getBBMap(BM); + AvailableValsTy &AvailableVals = getAvailableVals(AV); + SmallVector WorkList; + WorkList.push_back(PHI); + + // Record this PHI. + BasicBlock *BB = PHI->getParent(); + AvailableVals[BB] = PHI; + (*BBMap)[BB]->AvailableVal = PHI; + + while (!WorkList.empty()) { + PHI = WorkList.pop_back_val(); + + // Iterate through the PHI's incoming values. 
+ for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i) { + PHINode *IncomingPHIVal = dyn_cast(PHI->getIncomingValue(i)); + if (!IncomingPHIVal) continue; + BB = IncomingPHIVal->getParent(); + BBInfo *Info = (*BBMap)[BB]; + if (!Info || Info->AvailableVal) + continue; + + // Record the PHI and add it to the worklist. + AvailableVals[BB] = IncomingPHIVal; + Info->AvailableVal = IncomingPHIVal; + WorkList.push_back(IncomingPHIVal); + } + } +} + +/// ClearPHITags - When one of the existing PHI nodes fails to match, clear +/// the PHITag values that were stored in the BBMap when checking to see if +/// it matched. +void SSAUpdater::ClearPHITags(PHINode *PHI) { + BBMapTy *BBMap = getBBMap(BM); + SmallVector WorkList; + WorkList.push_back(PHI); + + // Clear the tag for this PHI. + (*BBMap)[PHI->getParent()]->PHITag = 0; + + while (!WorkList.empty()) { + PHI = WorkList.pop_back_val(); + + // Iterate through the PHI's incoming values. + for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i) { + PHINode *IncomingPHIVal = dyn_cast(PHI->getIncomingValue(i)); + if (!IncomingPHIVal) continue; + BasicBlock *BB = IncomingPHIVal->getParent(); + BBInfo *Info = (*BBMap)[BB]; + if (!Info || Info->AvailableVal || !Info->PHITag) + continue; + + // Clear the tag and add the PHI to the worklist. + Info->PHITag = 0; + WorkList.push_back(IncomingPHIVal); + } } - - return InsertedVal; } diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index 2ce5bdcd61d4..9f2209dfcfc0 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -224,7 +224,7 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB, if (BI->isUnconditional() && BI->getSuccessor(0) == BB) { if (!AggressiveInsts) return false; // Okay, it looks like the instruction IS in the "condition". 
Check to - // see if its a cheap instruction to unconditionally compute, and if it + // see if it's a cheap instruction to unconditionally compute, and if it // only uses stuff defined outside of the condition. If so, hoist it out. if (!I->isSafeToSpeculativelyExecute()) return false; @@ -1768,7 +1768,7 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) { Pred->getInstList().remove(II); // Take out of symbol table // Insert the call now. - SmallVector Args(II->op_begin()+3, II->op_end()); + SmallVector Args(II->op_begin(), II->op_end()-3); CallInst *CI = CallInst::Create(II->getCalledValue(), Args.begin(), Args.end(), II->getName(), BI); @@ -1970,13 +1970,13 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) { II->removeFromParent(); // Take out of symbol table // Insert the call now... - SmallVector Args(II->op_begin()+3, II->op_end()); + SmallVector Args(II->op_begin(), II->op_end()-3); CallInst *CI = CallInst::Create(II->getCalledValue(), Args.begin(), Args.end(), II->getName(), BI); CI->setCallingConv(II->getCallingConv()); CI->setAttributes(II->getAttributes()); - // If the invoke produced a value, the Call does now instead. + // If the invoke produced a value, the call does now instead. II->replaceAllUsesWith(CI); delete II; Changed = true; diff --git a/lib/VMCore/AsmWriter.cpp b/lib/VMCore/AsmWriter.cpp index 0eb9f020dc9a..f6a6076df7bc 100644 --- a/lib/VMCore/AsmWriter.cpp +++ b/lib/VMCore/AsmWriter.cpp @@ -1681,7 +1681,7 @@ void AssemblyWriter::printBasicBlock(const BasicBlock *BB) { // Output predecessors for the block... 
Out.PadToColumn(50); Out << ";"; - pred_const_iterator PI = pred_begin(BB), PE = pred_end(BB); + const_pred_iterator PI = pred_begin(BB), PE = pred_end(BB); if (PI == PE) { Out << " No predecessors!"; @@ -1875,6 +1875,7 @@ void AssemblyWriter::printInstruction(const Instruction &I) { if (PAL.getFnAttributes() != Attribute::None) Out << ' ' << Attribute::getAsString(PAL.getFnAttributes()); } else if (const InvokeInst *II = dyn_cast(&I)) { + Operand = II->getCalledValue(); const PointerType *PTy = cast(Operand->getType()); const FunctionType *FTy = cast(PTy->getElementType()); const Type *RetTy = FTy->getReturnType(); @@ -1912,10 +1913,10 @@ void AssemblyWriter::printInstruction(const Instruction &I) { writeOperand(Operand, true); } Out << '('; - for (unsigned op = 3, Eop = I.getNumOperands(); op < Eop; ++op) { - if (op > 3) + for (unsigned op = 0, Eop = I.getNumOperands() - 3; op < Eop; ++op) { + if (op) Out << ", "; - writeParamOperand(I.getOperand(op), PAL.getParamAttributes(op-2)); + writeParamOperand(I.getOperand(op), PAL.getParamAttributes(op + 1)); } Out << ')'; diff --git a/lib/VMCore/AutoUpgrade.cpp b/lib/VMCore/AutoUpgrade.cpp index 5e4c9fb76624..b9aa5c34675b 100644 --- a/lib/VMCore/AutoUpgrade.cpp +++ b/lib/VMCore/AutoUpgrade.cpp @@ -225,7 +225,12 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { // Calls to these intrinsics are transformed into ShuffleVector's. NewFn = 0; return true; + } else if (Name.compare(5, 16, "x86.sse41.pmulld", 16) == 0) { + // Calls to these intrinsics are transformed into vector multiplies. + NewFn = 0; + return true; } + break; } @@ -355,6 +360,18 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { // Clean up the old call now that it has been completely upgraded. CI->eraseFromParent(); + } else if (F->getName() == "llvm.x86.sse41.pmulld") { + // Upgrade this set of intrinsics into vector multiplies. 
+ Instruction *Mul = BinaryOperator::CreateMul(CI->getOperand(1), + CI->getOperand(2), + CI->getName(), + CI); + // Fix up all the uses with our new multiply. + if (!CI->use_empty()) + CI->replaceAllUsesWith(Mul); + + // Remove upgraded multiply. + CI->eraseFromParent(); } else { llvm_unreachable("Unknown function for CallInst upgrade."); } diff --git a/lib/VMCore/CMakeLists.txt b/lib/VMCore/CMakeLists.txt index 4b80e36530bd..c64564b8a1e7 100644 --- a/lib/VMCore/CMakeLists.txt +++ b/lib/VMCore/CMakeLists.txt @@ -6,6 +6,7 @@ add_llvm_library(LLVMCore ConstantFold.cpp Constants.cpp Core.cpp + DebugLoc.cpp Dominators.cpp Function.cpp GVMaterializer.cpp @@ -16,6 +17,7 @@ add_llvm_library(LLVMCore Instructions.cpp IntrinsicInst.cpp LLVMContext.cpp + LLVMContextImpl.cpp LeakDetector.cpp Metadata.cpp Module.cpp diff --git a/lib/VMCore/Constants.cpp b/lib/VMCore/Constants.cpp index 10f88794f5a2..1553bd51342d 100644 --- a/lib/VMCore/Constants.cpp +++ b/lib/VMCore/Constants.cpp @@ -59,6 +59,7 @@ Constant *Constant::getNullValue(const Type *Ty) { case Type::PointerTyID: return ConstantPointerNull::get(cast(Ty)); case Type::StructTyID: + case Type::UnionTyID: case Type::ArrayTyID: case Type::VectorTyID: return ConstantAggregateZero::get(Ty); @@ -160,7 +161,7 @@ bool Constant::canTrap() const { /// isConstantUsed - Return true if the constant has users other than constant /// exprs and other dangling things. 
bool Constant::isConstantUsed() const { - for (use_const_iterator UI = use_begin(), E = use_end(); UI != E; ++UI) { + for (const_use_iterator UI = use_begin(), E = use_end(); UI != E; ++UI) { const Constant *UC = dyn_cast(*UI); if (UC == 0 || isa(UC)) return true; @@ -944,7 +945,8 @@ bool ConstantFP::isValueValidForType(const Type *Ty, const APFloat& Val) { // Factory Function Implementation ConstantAggregateZero* ConstantAggregateZero::get(const Type* Ty) { - assert((Ty->isStructTy() || Ty->isArrayTy() || Ty->isVectorTy()) && + assert((Ty->isStructTy() || Ty->isUnionTy() + || Ty->isArrayTy() || Ty->isVectorTy()) && "Cannot create an aggregate zero of non-aggregate type!"); LLVMContextImpl *pImpl = Ty->getContext().pImpl; @@ -1945,6 +1947,20 @@ const char *ConstantExpr::getOpcodeName() const { return Instruction::getOpcodeName(getOpcode()); } + + +GetElementPtrConstantExpr:: +GetElementPtrConstantExpr(Constant *C, const std::vector &IdxList, + const Type *DestTy) + : ConstantExpr(DestTy, Instruction::GetElementPtr, + OperandTraits::op_end(this) + - (IdxList.size()+1), IdxList.size()+1) { + OperandList[0] = C; + for (unsigned i = 0, E = IdxList.size(); i != E; ++i) + OperandList[i+1] = IdxList[i]; +} + + //===----------------------------------------------------------------------===// // replaceUsesOfWithOnConstant implementations diff --git a/lib/VMCore/ConstantsContext.h b/lib/VMCore/ConstantsContext.h index c798ba2664cf..2f2fac53f062 100644 --- a/lib/VMCore/ConstantsContext.h +++ b/lib/VMCore/ConstantsContext.h @@ -15,6 +15,7 @@ #ifndef LLVM_CONSTANTSCONTEXT_H #define LLVM_CONSTANTSCONTEXT_H +#include "llvm/InlineAsm.h" #include "llvm/Instructions.h" #include "llvm/Operator.h" #include "llvm/Support/Debug.h" @@ -327,6 +328,39 @@ struct ExprMapKeyType { } }; +struct InlineAsmKeyType { + InlineAsmKeyType(StringRef AsmString, + StringRef Constraints, bool hasSideEffects, + bool isAlignStack) + : asm_string(AsmString), constraints(Constraints), + 
has_side_effects(hasSideEffects), is_align_stack(isAlignStack) {} + std::string asm_string; + std::string constraints; + bool has_side_effects; + bool is_align_stack; + bool operator==(const InlineAsmKeyType& that) const { + return this->asm_string == that.asm_string && + this->constraints == that.constraints && + this->has_side_effects == that.has_side_effects && + this->is_align_stack == that.is_align_stack; + } + bool operator<(const InlineAsmKeyType& that) const { + if (this->asm_string != that.asm_string) + return this->asm_string < that.asm_string; + if (this->constraints != that.constraints) + return this->constraints < that.constraints; + if (this->has_side_effects != that.has_side_effects) + return this->has_side_effects < that.has_side_effects; + if (this->is_align_stack != that.is_align_stack) + return this->is_align_stack < that.is_align_stack; + return false; + } + + bool operator!=(const InlineAsmKeyType& that) const { + return !(*this == that); + } +}; + // The number of operands for each ConstantCreator::create method is // determined by the ConstantTraits template. 
// ConstantCreator - A class that is used to create constants by @@ -517,6 +551,23 @@ struct ConstantKeyData { } }; +template<> +struct ConstantCreator { + static InlineAsm *create(const PointerType *Ty, const InlineAsmKeyType &Key) { + return new InlineAsm(Ty, Key.asm_string, Key.constraints, + Key.has_side_effects, Key.is_align_stack); + } +}; + +template<> +struct ConstantKeyData { + typedef InlineAsmKeyType ValType; + static ValType getValType(InlineAsm *Asm) { + return InlineAsmKeyType(Asm->getAsmString(), Asm->getConstraintString(), + Asm->hasSideEffects(), Asm->isAlignStack()); + } +}; + template class ConstantUniqueMap : public AbstractTypeUser { @@ -549,8 +600,8 @@ class ConstantUniqueMap : public AbstractTypeUser { void freeConstants() { for (typename MapTy::iterator I=Map.begin(), E=Map.end(); I != E; ++I) { - if (I->second->use_empty()) - delete I->second; + // Asserts that use_empty(). + delete I->second; } } diff --git a/lib/VMCore/Core.cpp b/lib/VMCore/Core.cpp index f4f65c54092b..44d487a8e2b4 100644 --- a/lib/VMCore/Core.cpp +++ b/lib/VMCore/Core.cpp @@ -1651,7 +1651,7 @@ LLVMBasicBlockRef LLVMGetInsertBlock(LLVMBuilderRef Builder) { } void LLVMClearInsertionPosition(LLVMBuilderRef Builder) { - unwrap(Builder)->ClearInsertionPoint (); + unwrap(Builder)->ClearInsertionPoint(); } void LLVMInsertIntoBuilder(LLVMBuilderRef Builder, LLVMValueRef Instr) { @@ -1670,11 +1670,13 @@ void LLVMDisposeBuilder(LLVMBuilderRef Builder) { /*--.. Metadata builders ...................................................--*/ void LLVMSetCurrentDebugLocation(LLVMBuilderRef Builder, LLVMValueRef L) { - unwrap(Builder)->SetCurrentDebugLocation(L? unwrap(L) : NULL); + MDNode *Loc = L ? 
unwrap(L) : NULL; + unwrap(Builder)->SetCurrentDebugLocation(NewDebugLoc::getFromDILocation(Loc)); } LLVMValueRef LLVMGetCurrentDebugLocation(LLVMBuilderRef Builder) { - return wrap(unwrap(Builder)->getCurrentDebugLocation()); + return wrap(unwrap(Builder)->getCurrentDebugLocation() + .getAsMDNode(unwrap(Builder)->getContext())); } void LLVMSetInstDebugLocation(LLVMBuilderRef Builder, LLVMValueRef Inst) { diff --git a/lib/VMCore/DebugLoc.cpp b/lib/VMCore/DebugLoc.cpp new file mode 100644 index 000000000000..f02ce57c3b04 --- /dev/null +++ b/lib/VMCore/DebugLoc.cpp @@ -0,0 +1,288 @@ +//===-- DebugLoc.cpp - Implement DebugLoc class ---------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/DebugLoc.h" +#include "LLVMContextImpl.h" +using namespace llvm; + +//===----------------------------------------------------------------------===// +// DebugLoc Implementation +//===----------------------------------------------------------------------===// + +MDNode *NewDebugLoc::getScope(const LLVMContext &Ctx) const { + if (ScopeIdx == 0) return 0; + + if (ScopeIdx > 0) { + // Positive ScopeIdx is an index into ScopeRecords, which has no inlined-at + // position specified. + assert(unsigned(ScopeIdx) <= Ctx.pImpl->ScopeRecords.size() && + "Invalid ScopeIdx!"); + return Ctx.pImpl->ScopeRecords[ScopeIdx-1].get(); + } + + // Otherwise, the index is in the ScopeInlinedAtRecords array. + assert(unsigned(-ScopeIdx) <= Ctx.pImpl->ScopeInlinedAtRecords.size() && + "Invalid ScopeIdx"); + return Ctx.pImpl->ScopeInlinedAtRecords[-ScopeIdx-1].first.get(); +} + +MDNode *NewDebugLoc::getInlinedAt(const LLVMContext &Ctx) const { + // Positive ScopeIdx is an index into ScopeRecords, which has no inlined-at + // position specified. Zero is invalid. 
+ if (ScopeIdx >= 0) return 0; + + // Otherwise, the index is in the ScopeInlinedAtRecords array. + assert(unsigned(-ScopeIdx) <= Ctx.pImpl->ScopeInlinedAtRecords.size() && + "Invalid ScopeIdx"); + return Ctx.pImpl->ScopeInlinedAtRecords[-ScopeIdx-1].second.get(); +} + +/// Return both the Scope and the InlinedAt values. +void NewDebugLoc::getScopeAndInlinedAt(MDNode *&Scope, MDNode *&IA, + const LLVMContext &Ctx) const { + if (ScopeIdx == 0) { + Scope = IA = 0; + return; + } + + if (ScopeIdx > 0) { + // Positive ScopeIdx is an index into ScopeRecords, which has no inlined-at + // position specified. + assert(unsigned(ScopeIdx) <= Ctx.pImpl->ScopeRecords.size() && + "Invalid ScopeIdx!"); + Scope = Ctx.pImpl->ScopeRecords[ScopeIdx-1].get(); + IA = 0; + return; + } + + // Otherwise, the index is in the ScopeInlinedAtRecords array. + assert(unsigned(-ScopeIdx) <= Ctx.pImpl->ScopeInlinedAtRecords.size() && + "Invalid ScopeIdx"); + Scope = Ctx.pImpl->ScopeInlinedAtRecords[-ScopeIdx-1].first.get(); + IA = Ctx.pImpl->ScopeInlinedAtRecords[-ScopeIdx-1].second.get(); +} + + +NewDebugLoc NewDebugLoc::get(unsigned Line, unsigned Col, + MDNode *Scope, MDNode *InlinedAt) { + NewDebugLoc Result; + + // If no scope is available, this is an unknown location. + if (Scope == 0) return Result; + + // Saturate line and col to "unknown". + if (Col > 255) Col = 0; + if (Line >= (1 << 24)) Line = 0; + Result.LineCol = Line | (Col << 24); + + LLVMContext &Ctx = Scope->getContext(); + + // If there is no inlined-at location, use the ScopeRecords array. + if (InlinedAt == 0) + Result.ScopeIdx = Ctx.pImpl->getOrAddScopeRecordIdxEntry(Scope, 0); + else + Result.ScopeIdx = Ctx.pImpl->getOrAddScopeInlinedAtIdxEntry(Scope, + InlinedAt, 0); + + return Result; +} + +/// getAsMDNode - This method converts the compressed DebugLoc node into a +/// DILocation compatible MDNode. 
+MDNode *NewDebugLoc::getAsMDNode(const LLVMContext &Ctx) const { + if (isUnknown()) return 0; + + MDNode *Scope, *IA; + getScopeAndInlinedAt(Scope, IA, Ctx); + assert(Scope && "If scope is null, this should be isUnknown()"); + + LLVMContext &Ctx2 = Scope->getContext(); + const Type *Int32 = Type::getInt32Ty(Ctx2); + Value *Elts[] = { + ConstantInt::get(Int32, getLine()), ConstantInt::get(Int32, getCol()), + Scope, IA + }; + return MDNode::get(Ctx2, &Elts[0], 4); +} + +/// getFromDILocation - Translate the DILocation quad into a NewDebugLoc. +NewDebugLoc NewDebugLoc::getFromDILocation(MDNode *N) { + if (N == 0 || N->getNumOperands() != 4) return NewDebugLoc(); + + MDNode *Scope = dyn_cast_or_null(N->getOperand(2)); + if (Scope == 0) return NewDebugLoc(); + + unsigned LineNo = 0, ColNo = 0; + if (ConstantInt *Line = dyn_cast_or_null(N->getOperand(0))) + LineNo = Line->getZExtValue(); + if (ConstantInt *Col = dyn_cast_or_null(N->getOperand(1))) + ColNo = Col->getZExtValue(); + + return get(LineNo, ColNo, Scope, dyn_cast_or_null(N->getOperand(3))); +} + +//===----------------------------------------------------------------------===// +// LLVMContextImpl Implementation +//===----------------------------------------------------------------------===// + +int LLVMContextImpl::getOrAddScopeRecordIdxEntry(MDNode *Scope, + int ExistingIdx) { + // If we already have an entry for this scope, return it. + int &Idx = ScopeRecordIdx[Scope]; + if (Idx) return Idx; + + // If we don't have an entry, but ExistingIdx is specified, use it. + if (ExistingIdx) + return Idx = ExistingIdx; + + // Otherwise add a new entry. + + // Start out ScopeRecords with a minimal reasonable size to avoid + // excessive reallocation starting out. + if (ScopeRecords.empty()) + ScopeRecords.reserve(128); + + // Index is biased by 1 for index. 
+ Idx = ScopeRecords.size()+1; + ScopeRecords.push_back(DebugRecVH(Scope, this, Idx)); + return Idx; +} + +int LLVMContextImpl::getOrAddScopeInlinedAtIdxEntry(MDNode *Scope, MDNode *IA, + int ExistingIdx) { + // If we already have an entry, return it. + int &Idx = ScopeInlinedAtIdx[std::make_pair(Scope, IA)]; + if (Idx) return Idx; + + // If we don't have an entry, but ExistingIdx is specified, use it. + if (ExistingIdx) + return Idx = ExistingIdx; + + // Start out ScopeInlinedAtRecords with a minimal reasonable size to avoid + // excessive reallocation starting out. + if (ScopeInlinedAtRecords.empty()) + ScopeInlinedAtRecords.reserve(128); + + // Index is biased by 1 and negated. + Idx = -ScopeInlinedAtRecords.size()-1; + ScopeInlinedAtRecords.push_back(std::make_pair(DebugRecVH(Scope, this, Idx), + DebugRecVH(IA, this, Idx))); + return Idx; +} + + +//===----------------------------------------------------------------------===// +// DebugRecVH Implementation +//===----------------------------------------------------------------------===// + +/// deleted - The MDNode this is pointing to got deleted, so this pointer needs +/// to drop to null and we need remove our entry from the DenseMap. +void DebugRecVH::deleted() { + // If this is a non-canonical reference, just drop the value to null, we know + // it doesn't have a map entry. + if (Idx == 0) { + setValPtr(0); + return; + } + + MDNode *Cur = get(); + + // If the index is positive, it is an entry in ScopeRecords. + if (Idx > 0) { + assert(Ctx->ScopeRecordIdx[Cur] == Idx && "Mapping out of date!"); + Ctx->ScopeRecordIdx.erase(Cur); + // Reset this VH to null and we're done. + setValPtr(0); + Idx = 0; + return; + } + + // Otherwise, it is an entry in ScopeInlinedAtRecords, we don't know if it + // is the scope or the inlined-at record entry. 
+ assert(unsigned(-Idx-1) < Ctx->ScopeInlinedAtRecords.size()); + std::pair &Entry = Ctx->ScopeInlinedAtRecords[-Idx-1]; + assert((this == &Entry.first || this == &Entry.second) && + "Mapping out of date!"); + + MDNode *OldScope = Entry.first.get(); + MDNode *OldInlinedAt = Entry.second.get(); + assert(OldScope != 0 && OldInlinedAt != 0 && + "Entry should be non-canonical if either val dropped to null"); + + // Otherwise, we do have an entry in it, nuke it and we're done. + assert(Ctx->ScopeInlinedAtIdx[std::make_pair(OldScope, OldInlinedAt)] == Idx&& + "Mapping out of date"); + Ctx->ScopeInlinedAtIdx.erase(std::make_pair(OldScope, OldInlinedAt)); + + // Reset this VH to null. Drop both 'Idx' values to null to indicate that + // we're in non-canonical form now. + setValPtr(0); + Entry.first.Idx = Entry.second.Idx = 0; +} + +void DebugRecVH::allUsesReplacedWith(Value *NewVa) { + // If being replaced with a non-mdnode value (e.g. undef) handle this as if + // the mdnode got deleted. + MDNode *NewVal = dyn_cast(NewVa); + if (NewVal == 0) return deleted(); + + // If this is a non-canonical reference, just change it, we know it already + // doesn't have a map entry. + if (Idx == 0) { + setValPtr(NewVa); + return; + } + + MDNode *OldVal = get(); + assert(OldVal != NewVa && "Node replaced with self?"); + + // If the index is positive, it is an entry in ScopeRecords. + if (Idx > 0) { + assert(Ctx->ScopeRecordIdx[OldVal] == Idx && "Mapping out of date!"); + Ctx->ScopeRecordIdx.erase(OldVal); + setValPtr(NewVal); + + int NewEntry = Ctx->getOrAddScopeRecordIdxEntry(NewVal, Idx); + + // If NewVal already has an entry, this becomes a non-canonical reference, + // just drop Idx to 0 to signify this. + if (NewEntry != Idx) + Idx = 0; + return; + } + + // Otherwise, it is an entry in ScopeInlinedAtRecords, we don't know if it + // is the scope or the inlined-at record entry. 
+ assert(unsigned(-Idx-1) < Ctx->ScopeInlinedAtRecords.size()); + std::pair &Entry = Ctx->ScopeInlinedAtRecords[-Idx-1]; + assert((this == &Entry.first || this == &Entry.second) && + "Mapping out of date!"); + + MDNode *OldScope = Entry.first.get(); + MDNode *OldInlinedAt = Entry.second.get(); + assert(OldScope != 0 && OldInlinedAt != 0 && + "Entry should be non-canonical if either val dropped to null"); + + // Otherwise, we do have an entry in it, nuke it and we're done. + assert(Ctx->ScopeInlinedAtIdx[std::make_pair(OldScope, OldInlinedAt)] == Idx&& + "Mapping out of date"); + Ctx->ScopeInlinedAtIdx.erase(std::make_pair(OldScope, OldInlinedAt)); + + // Reset this VH to the new value. + setValPtr(NewVal); + + int NewIdx = Ctx->getOrAddScopeInlinedAtIdxEntry(Entry.first.get(), + Entry.second.get(), Idx); + // If NewVal already has an entry, this becomes a non-canonical reference, + // just drop Idx to 0 to signify this. + if (NewIdx != Idx) { + std::pair &Entry=Ctx->ScopeInlinedAtRecords[-Idx-1]; + Entry.first.Idx = Entry.second.Idx = 0; + } +} diff --git a/lib/VMCore/Function.cpp b/lib/VMCore/Function.cpp index dbc283e49ac6..8f94efc6673a 100644 --- a/lib/VMCore/Function.cpp +++ b/lib/VMCore/Function.cpp @@ -16,6 +16,7 @@ #include "llvm/IntrinsicInst.h" #include "llvm/LLVMContext.h" #include "llvm/CodeGen/ValueTypes.h" +#include "llvm/Support/CallSite.h" #include "llvm/Support/LeakDetector.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/StringPool.h" @@ -400,13 +401,16 @@ Function *Intrinsic::getDeclaration(Module *M, ID id, const Type **Tys, #include "llvm/Intrinsics.gen" #undef GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN - /// hasAddressTaken - returns true if there are any uses of this function - /// other than direct calls or invokes to it. 
-bool Function::hasAddressTaken() const { - for (Value::use_const_iterator I = use_begin(), E = use_end(); I != E; ++I) { - if (I.getOperandNo() != 0 || - (!isa(*I) && !isa(*I))) - return true; +/// hasAddressTaken - returns true if there are any uses of this function +/// other than direct calls or invokes to it. +bool Function::hasAddressTaken(const User* *PutOffender) const { + for (Value::const_use_iterator I = use_begin(), E = use_end(); I != E; ++I) { + const User *U = *I; + if (!isa(U) && !isa(U)) + return PutOffender ? (*PutOffender = U, true) : true; + ImmutableCallSite CS(cast(U)); + if (!CS.isCallee(I)) + return PutOffender ? (*PutOffender = U, true) : true; } return false; } diff --git a/lib/VMCore/Globals.cpp b/lib/VMCore/Globals.cpp index 489ec650e071..b758eb8702ae 100644 --- a/lib/VMCore/Globals.cpp +++ b/lib/VMCore/Globals.cpp @@ -61,8 +61,8 @@ void GlobalValue::Dematerialize() { /// that want to check to see if a global is unused, but don't want to deal /// with potentially dead constants hanging off of the globals. void GlobalValue::removeDeadConstantUsers() const { - Value::use_const_iterator I = use_begin(), E = use_end(); - Value::use_const_iterator LastNonDeadUser = E; + Value::const_use_iterator I = use_begin(), E = use_end(); + Value::const_use_iterator LastNonDeadUser = E; while (I != E) { if (const Constant *User = dyn_cast(*I)) { if (!removeDeadUsersOfConstant(User)) { diff --git a/lib/VMCore/IRBuilder.cpp b/lib/VMCore/IRBuilder.cpp index 9f2786e4e38d..c1b783c75210 100644 --- a/lib/VMCore/IRBuilder.cpp +++ b/lib/VMCore/IRBuilder.cpp @@ -32,19 +32,6 @@ Value *IRBuilderBase::CreateGlobalString(const char *Str, const Twine &Name) { return GV; } -/// SetCurrentDebugLocation - Set location information used by debugging -/// information. 
-void IRBuilderBase::SetCurrentDebugLocation(MDNode *L) { - if (DbgMDKind == 0) - DbgMDKind = Context.getMDKindID("dbg"); - CurDbgLocation = L; -} - -void IRBuilderBase::SetInstDebugLocation(Instruction *I) const { - if (CurDbgLocation) - I->setMetadata(DbgMDKind, CurDbgLocation); -} - const Type *IRBuilderBase::getCurrentFunctionReturnType() const { assert(BB && BB->getParent() && "No current function!"); return BB->getParent()->getReturnType(); diff --git a/lib/VMCore/InlineAsm.cpp b/lib/VMCore/InlineAsm.cpp index 6355834880bd..0d2eca9c3dea 100644 --- a/lib/VMCore/InlineAsm.cpp +++ b/lib/VMCore/InlineAsm.cpp @@ -12,6 +12,8 @@ //===----------------------------------------------------------------------===// #include "llvm/InlineAsm.h" +#include "ConstantsContext.h" +#include "LLVMContextImpl.h" #include "llvm/DerivedTypes.h" #include #include @@ -23,28 +25,29 @@ InlineAsm::~InlineAsm() { } -// NOTE: when memoizing the function type, we have to be careful to handle the -// case when the type gets refined. - InlineAsm *InlineAsm::get(const FunctionType *Ty, StringRef AsmString, StringRef Constraints, bool hasSideEffects, bool isAlignStack) { - // FIXME: memoize! - return new InlineAsm(Ty, AsmString, Constraints, hasSideEffects, - isAlignStack); + InlineAsmKeyType Key(AsmString, Constraints, hasSideEffects, isAlignStack); + LLVMContextImpl *pImpl = Ty->getContext().pImpl; + return pImpl->InlineAsms.getOrCreate(PointerType::getUnqual(Ty), Key); } -InlineAsm::InlineAsm(const FunctionType *Ty, StringRef asmString, - StringRef constraints, bool hasSideEffects, +InlineAsm::InlineAsm(const PointerType *Ty, const std::string &asmString, + const std::string &constraints, bool hasSideEffects, bool isAlignStack) - : Value(PointerType::getUnqual(Ty), - Value::InlineAsmVal), + : Value(Ty, Value::InlineAsmVal), AsmString(asmString), Constraints(constraints), HasSideEffects(hasSideEffects), IsAlignStack(isAlignStack) { // Do various checks on the constraint string and type. 
- assert(Verify(Ty, constraints) && "Function type not legal for constraints!"); + assert(Verify(getFunctionType(), constraints) && + "Function type not legal for constraints!"); +} + +void InlineAsm::destroyConstant() { + delete this; } const FunctionType *InlineAsm::getFunctionType() const { diff --git a/lib/VMCore/Instruction.cpp b/lib/VMCore/Instruction.cpp index 3fabfd07b316..a37fe070bda0 100644 --- a/lib/VMCore/Instruction.cpp +++ b/lib/VMCore/Instruction.cpp @@ -283,7 +283,7 @@ bool Instruction::isSameOperationAs(const Instruction *I) const { /// specified block. Note that PHI nodes are considered to evaluate their /// operands in the corresponding predecessor block. bool Instruction::isUsedOutsideOfBlock(const BasicBlock *BB) const { - for (use_const_iterator UI = use_begin(), E = use_end(); UI != E; ++UI) { + for (const_use_iterator UI = use_begin(), E = use_end(); UI != E; ++UI) { // PHI nodes uses values in the corresponding predecessor block. For other // instructions, just check to see whether the parent of the use matches up. const PHINode *PN = dyn_cast(*UI); diff --git a/lib/VMCore/Instructions.cpp b/lib/VMCore/Instructions.cpp index 8f4763f5a492..4609a64213bf 100644 --- a/lib/VMCore/Instructions.cpp +++ b/lib/VMCore/Instructions.cpp @@ -31,23 +31,18 @@ using namespace llvm; //===----------------------------------------------------------------------===// #define CALLSITE_DELEGATE_GETTER(METHOD) \ - Instruction *II(getInstruction()); \ + Instruction *II = getInstruction(); \ return isCall() \ ? 
cast(II)->METHOD \ : cast(II)->METHOD #define CALLSITE_DELEGATE_SETTER(METHOD) \ - Instruction *II(getInstruction()); \ + Instruction *II = getInstruction(); \ if (isCall()) \ cast(II)->METHOD; \ else \ cast(II)->METHOD -CallSite::CallSite(Instruction *C) { - assert((isa(C) || isa(C)) && "Not a call!"); - I.setPointer(C); - I.setInt(isa(C)); -} CallingConv::ID CallSite::getCallingConv() const { CALLSITE_DELEGATE_GETTER(getCallingConv()); } @@ -66,6 +61,17 @@ bool CallSite::paramHasAttr(uint16_t i, Attributes attr) const { uint16_t CallSite::getParamAlignment(uint16_t i) const { CALLSITE_DELEGATE_GETTER(getParamAlignment(i)); } + +/// @brief Return true if the call should not be inlined. +bool CallSite::isNoInline() const { + CALLSITE_DELEGATE_GETTER(isNoInline()); +} + +void CallSite::setIsNoInline(bool Value) { + CALLSITE_DELEGATE_GETTER(setIsNoInline(Value)); +} + + bool CallSite::doesNotAccessMemory() const { CALLSITE_DELEGATE_GETTER(doesNotAccessMemory()); } @@ -98,6 +104,13 @@ bool CallSite::hasArgument(const Value *Arg) const { return false; } +User::op_iterator CallSite::getCallee() const { + Instruction *II(getInstruction()); + return isCall() + ? cast(II)->op_begin() + : cast(II)->op_end() - 3; // Skip BB, BB, Function +} + #undef CALLSITE_DELEGATE_GETTER #undef CALLSITE_DELEGATE_SETTER @@ -611,24 +624,24 @@ Instruction* CallInst::CreateFree(Value* Source, BasicBlock *InsertAtEnd) { void InvokeInst::init(Value *Fn, BasicBlock *IfNormal, BasicBlock *IfException, Value* const *Args, unsigned NumArgs) { assert(NumOperands == 3+NumArgs && "NumOperands not set up?"); - Use *OL = OperandList; - OL[0] = Fn; - OL[1] = IfNormal; - OL[2] = IfException; + Op<-3>() = Fn; + Op<-2>() = IfNormal; + Op<-1>() = IfException; const FunctionType *FTy = cast(cast(Fn->getType())->getElementType()); FTy = FTy; // silence warning. 
assert(((NumArgs == FTy->getNumParams()) || (FTy->isVarArg() && NumArgs > FTy->getNumParams())) && - "Calling a function with bad signature"); + "Invoking a function with bad signature"); + Use *OL = OperandList; for (unsigned i = 0, e = NumArgs; i != e; i++) { assert((i >= FTy->getNumParams() || FTy->getParamType(i) == Args[i]->getType()) && "Invoking a function with a bad signature!"); - OL[i+3] = Args[i]; + OL[i] = Args[i]; } } diff --git a/lib/VMCore/LLVMContext.cpp b/lib/VMCore/LLVMContext.cpp index 5a8ea5cf6d34..2a870ec6cfd9 100644 --- a/lib/VMCore/LLVMContext.cpp +++ b/lib/VMCore/LLVMContext.cpp @@ -26,19 +26,52 @@ LLVMContext& llvm::getGlobalContext() { return *GlobalContext; } -LLVMContext::LLVMContext() : pImpl(new LLVMContextImpl(*this)) { } +LLVMContext::LLVMContext() : pImpl(new LLVMContextImpl(*this)) { + // Create the first metadata kind, which is always 'dbg'. + unsigned DbgID = getMDKindID("dbg"); + assert(DbgID == MD_dbg && "dbg kind id drifted"); (void)DbgID; +} LLVMContext::~LLVMContext() { delete pImpl; } -GetElementPtrConstantExpr::GetElementPtrConstantExpr - (Constant *C, - const std::vector &IdxList, - const Type *DestTy) - : ConstantExpr(DestTy, Instruction::GetElementPtr, - OperandTraits::op_end(this) - - (IdxList.size()+1), - IdxList.size()+1) { - OperandList[0] = C; - for (unsigned i = 0, E = IdxList.size(); i != E; ++i) - OperandList[i+1] = IdxList[i]; + +#ifndef NDEBUG +/// isValidName - Return true if Name is a valid custom metadata handler name. +static bool isValidName(StringRef MDName) { + if (MDName.empty()) + return false; + + if (!isalpha(MDName[0])) + return false; + + for (StringRef::iterator I = MDName.begin() + 1, E = MDName.end(); I != E; + ++I) { + if (!isalnum(*I) && *I != '_' && *I != '-' && *I != '.') + return false; + } + return true; +} +#endif + +/// getMDKindID - Return a unique non-zero ID for the specified metadata kind. 
+unsigned LLVMContext::getMDKindID(StringRef Name) const { + assert(isValidName(Name) && "Invalid MDNode name"); + + unsigned &Entry = pImpl->CustomMDKindNames[Name]; + + // If this is new, assign it its ID. + if (Entry == 0) Entry = pImpl->CustomMDKindNames.size(); + return Entry; } +/// getHandlerNames - Populate client supplied smallvector using custome +/// metadata name and ID. +void LLVMContext::getMDKindNames(SmallVectorImpl &Names) const { + Names.resize(pImpl->CustomMDKindNames.size()+1); + Names[0] = ""; + for (StringMap::const_iterator I = pImpl->CustomMDKindNames.begin(), + E = pImpl->CustomMDKindNames.end(); I != E; ++I) + // MD Handlers are numbered from 1. + Names[I->second] = I->first(); +} + + diff --git a/lib/VMCore/LLVMContextImpl.cpp b/lib/VMCore/LLVMContextImpl.cpp new file mode 100644 index 000000000000..b4553ddc1b9f --- /dev/null +++ b/lib/VMCore/LLVMContextImpl.cpp @@ -0,0 +1,103 @@ +//===-- LLVMContextImpl.cpp - Implement LLVMContextImpl -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the opaque LLVMContextImpl. +// +//===----------------------------------------------------------------------===// + +#include "LLVMContextImpl.h" +#include + +LLVMContextImpl::LLVMContextImpl(LLVMContext &C) + : TheTrueVal(0), TheFalseVal(0), + VoidTy(C, Type::VoidTyID), + LabelTy(C, Type::LabelTyID), + FloatTy(C, Type::FloatTyID), + DoubleTy(C, Type::DoubleTyID), + MetadataTy(C, Type::MetadataTyID), + X86_FP80Ty(C, Type::X86_FP80TyID), + FP128Ty(C, Type::FP128TyID), + PPC_FP128Ty(C, Type::PPC_FP128TyID), + Int1Ty(C, 1), + Int8Ty(C, 8), + Int16Ty(C, 16), + Int32Ty(C, 32), + Int64Ty(C, 64), + AlwaysOpaqueTy(new OpaqueType(C)) { + // Make sure the AlwaysOpaqueTy stays alive as long as the Context. 
+ AlwaysOpaqueTy->addRef(); + OpaqueTypes.insert(AlwaysOpaqueTy); +} + +namespace { +struct DropReferences { + // Takes the value_type of a ConstantUniqueMap's internal map, whose 'second' + // is a Constant*. + template + void operator()(const PairT &P) { + P.second->dropAllReferences(); + } +}; +} + +LLVMContextImpl::~LLVMContextImpl() { + std::for_each(ExprConstants.map_begin(), ExprConstants.map_end(), + DropReferences()); + std::for_each(ArrayConstants.map_begin(), ArrayConstants.map_end(), + DropReferences()); + std::for_each(StructConstants.map_begin(), StructConstants.map_end(), + DropReferences()); + std::for_each(UnionConstants.map_begin(), UnionConstants.map_end(), + DropReferences()); + std::for_each(VectorConstants.map_begin(), VectorConstants.map_end(), + DropReferences()); + ExprConstants.freeConstants(); + ArrayConstants.freeConstants(); + StructConstants.freeConstants(); + UnionConstants.freeConstants(); + VectorConstants.freeConstants(); + AggZeroConstants.freeConstants(); + NullPtrConstants.freeConstants(); + UndefValueConstants.freeConstants(); + InlineAsms.freeConstants(); + for (IntMapTy::iterator I = IntConstants.begin(), E = IntConstants.end(); + I != E; ++I) { + delete I->second; + } + for (FPMapTy::iterator I = FPConstants.begin(), E = FPConstants.end(); + I != E; ++I) { + delete I->second; + } + AlwaysOpaqueTy->dropRef(); + for (OpaqueTypesTy::iterator I = OpaqueTypes.begin(), E = OpaqueTypes.end(); + I != E; ++I) { + (*I)->AbstractTypeUsers.clear(); + delete *I; + } + // Destroy MDNodes. ~MDNode can move and remove nodes between the MDNodeSet + // and the NonUniquedMDNodes sets, so copy the values out first. 
+ SmallVector MDNodes; + MDNodes.reserve(MDNodeSet.size() + NonUniquedMDNodes.size()); + for (FoldingSetIterator I = MDNodeSet.begin(), E = MDNodeSet.end(); + I != E; ++I) { + MDNodes.push_back(&*I); + } + MDNodes.append(NonUniquedMDNodes.begin(), NonUniquedMDNodes.end()); + for (SmallVector::iterator I = MDNodes.begin(), + E = MDNodes.end(); I != E; ++I) { + (*I)->destroy(); + } + assert(MDNodeSet.empty() && NonUniquedMDNodes.empty() && + "Destroying all MDNodes didn't empty the Context's sets."); + // Destroy MDStrings. + for (StringMap::iterator I = MDStringCache.begin(), + E = MDStringCache.end(); I != E; ++I) { + delete I->second; + } +} diff --git a/lib/VMCore/LLVMContextImpl.h b/lib/VMCore/LLVMContextImpl.h index 9978f40ed0da..d4ebf8044d8b 100644 --- a/lib/VMCore/LLVMContextImpl.h +++ b/lib/VMCore/LLVMContextImpl.h @@ -1,4 +1,4 @@ -//===-- LLVMContextImpl.h - The LLVMContextImpl opaque class --------------===// +//===-- LLVMContextImpl.h - The LLVMContextImpl opaque class ----*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -19,9 +19,9 @@ #include "LeaksContext.h" #include "TypesContext.h" #include "llvm/LLVMContext.h" -#include "llvm/Metadata.h" #include "llvm/Constants.h" #include "llvm/DerivedTypes.h" +#include "llvm/Metadata.h" #include "llvm/Assembly/Writer.h" #include "llvm/Support/ValueHandle.h" #include "llvm/ADT/APFloat.h" @@ -36,8 +36,6 @@ namespace llvm { class ConstantInt; class ConstantFP; -class MDString; -class MDNode; class LLVMContext; class Type; class Value; @@ -92,6 +90,29 @@ struct DenseMapAPFloatKeyInfo { } }; +/// DebugRecVH - This is a CallbackVH used to keep the Scope -> index maps +/// up to date as MDNodes mutate. This class is implemented in DebugLoc.cpp. +class DebugRecVH : public CallbackVH { + /// Ctx - This is the LLVM Context being referenced. + LLVMContextImpl *Ctx; + + /// Idx - The index into either ScopeRecordIdx or ScopeInlinedAtRecords that + /// this reference lives in. 
If this is zero, then it represents a + /// non-canonical entry that has no DenseMap value. This can happen due to + /// RAUW. + int Idx; +public: + DebugRecVH(MDNode *n, LLVMContextImpl *ctx, int idx) + : CallbackVH(n), Ctx(ctx), Idx(idx) {} + + MDNode *get() const { + return cast_or_null(getValPtr()); + } + + virtual void deleted(); + virtual void allUsesReplacedWith(Value *VNew); +}; + class LLVMContextImpl { public: typedef DenseMap NullPtrConstants; - ConstantUniqueMap UndefValueConstants; DenseMap , BlockAddress*> BlockAddresses; ConstantUniqueMap ExprConstants; + + ConstantUniqueMap InlineAsms; ConstantInt *TheTrueVal; ConstantInt *TheFalseVal; @@ -195,72 +217,29 @@ class LLVMContextImpl { /// context. DenseMap MetadataStore; + /// ScopeRecordIdx - This is the index in ScopeRecords for an MDNode scope + /// entry with no "inlined at" element. + DenseMap ScopeRecordIdx; - LLVMContextImpl(LLVMContext &C) : TheTrueVal(0), TheFalseVal(0), - VoidTy(C, Type::VoidTyID), - LabelTy(C, Type::LabelTyID), - FloatTy(C, Type::FloatTyID), - DoubleTy(C, Type::DoubleTyID), - MetadataTy(C, Type::MetadataTyID), - X86_FP80Ty(C, Type::X86_FP80TyID), - FP128Ty(C, Type::FP128TyID), - PPC_FP128Ty(C, Type::PPC_FP128TyID), - Int1Ty(C, 1), - Int8Ty(C, 8), - Int16Ty(C, 16), - Int32Ty(C, 32), - Int64Ty(C, 64), - AlwaysOpaqueTy(new OpaqueType(C)) { - // Make sure the AlwaysOpaqueTy stays alive as long as the Context. 
- AlwaysOpaqueTy->addRef(); - OpaqueTypes.insert(AlwaysOpaqueTy); - } - - ~LLVMContextImpl() { - ExprConstants.freeConstants(); - ArrayConstants.freeConstants(); - StructConstants.freeConstants(); - VectorConstants.freeConstants(); - AggZeroConstants.freeConstants(); - NullPtrConstants.freeConstants(); - UndefValueConstants.freeConstants(); - for (IntMapTy::iterator I = IntConstants.begin(), E = IntConstants.end(); - I != E; ++I) { - if (I->second->use_empty()) - delete I->second; - } - for (FPMapTy::iterator I = FPConstants.begin(), E = FPConstants.end(); - I != E; ++I) { - if (I->second->use_empty()) - delete I->second; - } - AlwaysOpaqueTy->dropRef(); - for (OpaqueTypesTy::iterator I = OpaqueTypes.begin(), E = OpaqueTypes.end(); - I != E; ++I) { - (*I)->AbstractTypeUsers.clear(); - delete *I; - } - // Destroy MDNodes. ~MDNode can move and remove nodes between the MDNodeSet - // and the NonUniquedMDNodes sets, so copy the values out first. - SmallVector MDNodes; - MDNodes.reserve(MDNodeSet.size() + NonUniquedMDNodes.size()); - for (FoldingSetIterator I = MDNodeSet.begin(), E = MDNodeSet.end(); - I != E; ++I) { - MDNodes.push_back(&*I); - } - MDNodes.append(NonUniquedMDNodes.begin(), NonUniquedMDNodes.end()); - for (SmallVector::iterator I = MDNodes.begin(), - E = MDNodes.end(); I != E; ++I) { - (*I)->destroy(); - } - assert(MDNodeSet.empty() && NonUniquedMDNodes.empty() && - "Destroying all MDNodes didn't empty the Context's sets."); - // Destroy MDStrings. - for (StringMap::iterator I = MDStringCache.begin(), - E = MDStringCache.end(); I != E; ++I) { - delete I->second; - } - } + /// ScopeRecords - These are the actual mdnodes (in a value handle) for an + /// index. The ValueHandle ensures that ScopeRecordIdx stays up to date if + /// the MDNode is RAUW'd. + std::vector ScopeRecords; + + /// ScopeInlinedAtIdx - This is the index in ScopeInlinedAtRecords for an + /// scope/inlined-at pair. 
+ DenseMap, int> ScopeInlinedAtIdx; + + /// ScopeInlinedAtRecords - These are the actual mdnodes (in value handles) + /// for an index. The ValueHandle ensures that ScopeINlinedAtIdx stays up + /// to date. + std::vector > ScopeInlinedAtRecords; + + int getOrAddScopeRecordIdxEntry(MDNode *N, int ExistingIdx); + int getOrAddScopeInlinedAtIdxEntry(MDNode *Scope, MDNode *IA,int ExistingIdx); + + LLVMContextImpl(LLVMContext &C); + ~LLVMContextImpl(); }; } diff --git a/lib/VMCore/Metadata.cpp b/lib/VMCore/Metadata.cpp index 06d4fd4ae5c6..73e60912e43b 100644 --- a/lib/VMCore/Metadata.cpp +++ b/lib/VMCore/Metadata.cpp @@ -182,19 +182,6 @@ MDNode *MDNode::getMDNode(LLVMContext &Context, Value *const *Vals, unsigned NumVals, FunctionLocalness FL, bool Insert) { LLVMContextImpl *pImpl = Context.pImpl; - FoldingSetNodeID ID; - for (unsigned i = 0; i != NumVals; ++i) - ID.AddPointer(Vals[i]); - - void *InsertPoint; - MDNode *N = NULL; - - if ((N = pImpl->MDNodeSet.FindNodeOrInsertPos(ID, InsertPoint))) - return N; - - if (!Insert) - return NULL; - bool isFunctionLocal = false; switch (FL) { case FL_Unknown: @@ -216,6 +203,20 @@ MDNode *MDNode::getMDNode(LLVMContext &Context, Value *const *Vals, break; } + FoldingSetNodeID ID; + for (unsigned i = 0; i != NumVals; ++i) + ID.AddPointer(Vals[i]); + ID.AddBoolean(isFunctionLocal); + + void *InsertPoint; + MDNode *N = NULL; + + if ((N = pImpl->MDNodeSet.FindNodeOrInsertPos(ID, InsertPoint))) + return N; + + if (!Insert) + return NULL; + // Coallocate space for the node and Operands together, then placement new. 
void *Ptr = malloc(sizeof(MDNode)+NumVals*sizeof(MDNodeOperand)); N = new (Ptr) MDNode(Context, Vals, NumVals, isFunctionLocal); @@ -248,6 +249,7 @@ Value *MDNode::getOperand(unsigned i) const { void MDNode::Profile(FoldingSetNodeID &ID) const { for (unsigned i = 0, e = getNumOperands(); i != e; ++i) ID.AddPointer(getOperand(i)); + ID.AddBoolean(isFunctionLocal()); } void MDNode::setIsNotUniqued() { @@ -409,50 +411,6 @@ StringRef NamedMDNode::getName() const { return StringRef(Name); } -//===----------------------------------------------------------------------===// -// LLVMContext MDKind naming implementation. -// - -#ifndef NDEBUG -/// isValidName - Return true if Name is a valid custom metadata handler name. -static bool isValidName(StringRef MDName) { - if (MDName.empty()) - return false; - - if (!isalpha(MDName[0])) - return false; - - for (StringRef::iterator I = MDName.begin() + 1, E = MDName.end(); I != E; - ++I) { - if (!isalnum(*I) && *I != '_' && *I != '-' && *I != '.') - return false; - } - return true; -} -#endif - -/// getMDKindID - Return a unique non-zero ID for the specified metadata kind. -unsigned LLVMContext::getMDKindID(StringRef Name) const { - assert(isValidName(Name) && "Invalid MDNode name"); - - unsigned &Entry = pImpl->CustomMDKindNames[Name]; - - // If this is new, assign it its ID. - if (Entry == 0) Entry = pImpl->CustomMDKindNames.size(); - return Entry; -} - -/// getHandlerNames - Populate client supplied smallvector using custome -/// metadata name and ID. -void LLVMContext::getMDKindNames(SmallVectorImpl &Names) const { - Names.resize(pImpl->CustomMDKindNames.size()+1); - Names[0] = ""; - for (StringMap::const_iterator I = pImpl->CustomMDKindNames.begin(), - E = pImpl->CustomMDKindNames.end(); I != E; ++I) - // MD Handlers are numbered from 1. - Names[I->second] = I->first(); -} - //===----------------------------------------------------------------------===// // Instruction Metadata method implementations. 
// @@ -466,18 +424,29 @@ MDNode *Instruction::getMetadataImpl(const char *Kind) const { return getMetadataImpl(getContext().getMDKindID(Kind)); } +void Instruction::setDbgMetadata(MDNode *Node) { + DbgLoc = NewDebugLoc::getFromDILocation(Node); +} + /// setMetadata - Set the metadata of of the specified kind to the specified /// node. This updates/replaces metadata if already present, or removes it if /// Node is null. void Instruction::setMetadata(unsigned KindID, MDNode *Node) { if (Node == 0 && !hasMetadata()) return; + // Handle 'dbg' as a special case since it is not stored in the hash table. + if (KindID == LLVMContext::MD_dbg) { + DbgLoc = NewDebugLoc::getFromDILocation(Node); + return; + } + // Handle the case when we're adding/updating metadata on an instruction. if (Node) { LLVMContextImpl::MDMapTy &Info = getContext().pImpl->MetadataStore[this]; - assert(!Info.empty() == hasMetadata() && "HasMetadata bit is wonked"); + assert(!Info.empty() == hasMetadataHashEntry() && + "HasMetadata bit is wonked"); if (Info.empty()) { - setHasMetadata(true); + setHasMetadataHashEntry(true); } else { // Handle replacement of an existing value. for (unsigned i = 0, e = Info.size(); i != e; ++i) @@ -493,18 +462,19 @@ void Instruction::setMetadata(unsigned KindID, MDNode *Node) { } // Otherwise, we're removing metadata from an instruction. - assert(hasMetadata() && getContext().pImpl->MetadataStore.count(this) && + assert(hasMetadataHashEntry() && + getContext().pImpl->MetadataStore.count(this) && "HasMetadata bit out of date!"); LLVMContextImpl::MDMapTy &Info = getContext().pImpl->MetadataStore[this]; // Common case is removing the only entry. if (Info.size() == 1 && Info[0].first == KindID) { getContext().pImpl->MetadataStore.erase(this); - setHasMetadata(false); + setHasMetadataHashEntry(false); return; } - // Handle replacement of an existing value. + // Handle removal of an existing value. 
for (unsigned i = 0, e = Info.size(); i != e; ++i) if (Info[i].first == KindID) { Info[i] = Info.back(); @@ -516,8 +486,14 @@ void Instruction::setMetadata(unsigned KindID, MDNode *Node) { } MDNode *Instruction::getMetadataImpl(unsigned KindID) const { + // Handle 'dbg' as a special case since it is not stored in the hash table. + if (KindID == LLVMContext::MD_dbg) + return DbgLoc.getAsMDNode(getContext()); + + if (!hasMetadataHashEntry()) return 0; + LLVMContextImpl::MDMapTy &Info = getContext().pImpl->MetadataStore[this]; - assert(hasMetadata() && !Info.empty() && "Shouldn't have called this"); + assert(!Info.empty() && "bit out of sync with hash table"); for (LLVMContextImpl::MDMapTy::iterator I = Info.begin(), E = Info.end(); I != E; ++I) @@ -527,14 +503,23 @@ MDNode *Instruction::getMetadataImpl(unsigned KindID) const { } void Instruction::getAllMetadataImpl(SmallVectorImpl > &Result)const { - assert(hasMetadata() && getContext().pImpl->MetadataStore.count(this) && + MDNode*> > &Result) const { + Result.clear(); + + // Handle 'dbg' as a special case since it is not stored in the hash table. + if (!DbgLoc.isUnknown()) { + Result.push_back(std::make_pair((unsigned)LLVMContext::MD_dbg, + DbgLoc.getAsMDNode(getContext()))); + if (!hasMetadataHashEntry()) return; + } + + assert(hasMetadataHashEntry() && + getContext().pImpl->MetadataStore.count(this) && "Shouldn't have called this"); const LLVMContextImpl::MDMapTy &Info = getContext().pImpl->MetadataStore.find(this)->second; assert(!Info.empty() && "Shouldn't have called this"); - Result.clear(); Result.append(Info.begin(), Info.end()); // Sort the resulting array so it is stable. 
@@ -542,10 +527,32 @@ void Instruction::getAllMetadataImpl(SmallVectorImpl > &Result) const { + Result.clear(); + assert(hasMetadataHashEntry() && + getContext().pImpl->MetadataStore.count(this) && + "Shouldn't have called this"); + const LLVMContextImpl::MDMapTy &Info = + getContext().pImpl->MetadataStore.find(this)->second; + assert(!Info.empty() && "Shouldn't have called this"); + + Result.append(Info.begin(), Info.end()); + + // Sort the resulting array so it is stable. + if (Result.size() > 1) + array_pod_sort(Result.begin(), Result.end()); +} + + /// removeAllMetadata - Remove all metadata from this instruction. void Instruction::removeAllMetadata() { assert(hasMetadata() && "Caller should check"); - getContext().pImpl->MetadataStore.erase(this); - setHasMetadata(false); + DbgLoc = NewDebugLoc(); + if (hasMetadataHashEntry()) { + getContext().pImpl->MetadataStore.erase(this); + setHasMetadataHashEntry(false); + } } diff --git a/lib/VMCore/Module.cpp b/lib/VMCore/Module.cpp index 001bb00f26ba..94840f07a4ab 100644 --- a/lib/VMCore/Module.cpp +++ b/lib/VMCore/Module.cpp @@ -82,7 +82,7 @@ Module::Endianness Module::getEndianness() const { while (!temp.empty()) { StringRef token = DataLayout; - tie(token, temp) = getToken(DataLayout, "-"); + tie(token, temp) = getToken(temp, "-"); if (token[0] == 'e') { ret = LittleEndian; diff --git a/lib/VMCore/PassManager.cpp b/lib/VMCore/PassManager.cpp index c4dfe1409b94..6774cecdcf24 100644 --- a/lib/VMCore/PassManager.cpp +++ b/lib/VMCore/PassManager.cpp @@ -378,17 +378,19 @@ namespace { static ManagedStatic > TimingInfoMutex; class TimingInfo { - std::map TimingData; + DenseMap TimingData; TimerGroup TG; - public: // Use 'create' member to get this. TimingInfo() : TG("... Pass execution timing report ...") {} // TimingDtor - Print out information about timing information ~TimingInfo() { - // Delete all of the timers... 
- TimingData.clear(); + // Delete all of the timers, which accumulate their info into the + // TimerGroup. + for (DenseMap::iterator I = TimingData.begin(), + E = TimingData.end(); I != E; ++I) + delete I->second; // TimerGroup is deleted next, printing the report. } @@ -397,18 +399,15 @@ class TimingInfo { // null. It may be called multiple times. static void createTheTimeInfo(); - /// passStarted - This method creates a timer for the given pass if it doesn't - /// already have one, and starts the timer. - Timer *passStarted(Pass *P) { + /// getPassTimer - Return the timer for the specified pass if it exists. + Timer *getPassTimer(Pass *P) { if (P->getAsPMDataManager()) return 0; sys::SmartScopedLock Lock(*TimingInfoMutex); - std::map::iterator I = TimingData.find(P); - if (I == TimingData.end()) - I=TimingData.insert(std::make_pair(P, Timer(P->getPassName(), TG))).first; - Timer *T = &I->second; - T->startTimer(); + Timer *&T = TimingData[P]; + if (T == 0) + T = new Timer(P->getPassName(), TG); return T; } }; @@ -704,11 +703,8 @@ void PMDataManager::verifyPreservedAnalysis(Pass *P) { E = PreservedSet.end(); I != E; ++I) { AnalysisID AID = *I; if (Pass *AP = findAnalysisPass(AID, true)) { - - Timer *T = 0; - if (TheTimeInfo) T = TheTimeInfo->passStarted(AP); + TimeRegion PassTimer(getPassTimer(AP)); AP->verifyAnalysis(); - if (T) T->stopTimer(); } } } @@ -792,10 +788,9 @@ void PMDataManager::freePass(Pass *P, StringRef Msg, { // If the pass crashes releasing memory, remember this. PassManagerPrettyStackEntry X(P); - - Timer *T = StartPassTimer(P); + TimeRegion PassTimer(getPassTimer(P)); + P->releaseMemory(); - StopPassTimer(P, T); } if (const PassInfo *PI = P->getPassInfo()) { @@ -1128,10 +1123,9 @@ bool BBPassManager::runOnFunction(Function &F) { { // If the pass crashes, remember this. 
PassManagerPrettyStackEntry X(BP, *I); - - Timer *T = StartPassTimer(BP); + TimeRegion PassTimer(getPassTimer(BP)); + LocalChanged |= BP->runOnBasicBlock(*I); - StopPassTimer(BP, T); } Changed |= LocalChanged; @@ -1345,10 +1339,9 @@ bool FPPassManager::runOnFunction(Function &F) { { PassManagerPrettyStackEntry X(FP, F); + TimeRegion PassTimer(getPassTimer(FP)); - Timer *T = StartPassTimer(FP); LocalChanged |= FP->runOnFunction(F); - StopPassTimer(FP, T); } Changed |= LocalChanged; @@ -1420,9 +1413,9 @@ MPPassManager::runOnModule(Module &M) { { PassManagerPrettyStackEntry X(MP, M); - Timer *T = StartPassTimer(MP); + TimeRegion PassTimer(getPassTimer(MP)); + LocalChanged |= MP->runOnModule(M); - StopPassTimer(MP, T); } Changed |= LocalChanged; @@ -1559,17 +1552,12 @@ void TimingInfo::createTheTimeInfo() { } /// If TimingInfo is enabled then start pass timer. -Timer *llvm::StartPassTimer(Pass *P) { +Timer *llvm::getPassTimer(Pass *P) { if (TheTimeInfo) - return TheTimeInfo->passStarted(P); + return TheTimeInfo->getPassTimer(P); return 0; } -/// If TimingInfo is enabled then stop pass timer. -void llvm::StopPassTimer(Pass *P, Timer *T) { - if (T) T->stopTimer(); -} - //===----------------------------------------------------------------------===// // PMStack implementation // diff --git a/lib/VMCore/Type.cpp b/lib/VMCore/Type.cpp index 2a0cfa8470ea..5f9c11fc0ed9 100644 --- a/lib/VMCore/Type.cpp +++ b/lib/VMCore/Type.cpp @@ -57,6 +57,11 @@ void AbstractTypeUser::setType(Value *V, const Type *NewTy) { /// need for a std::vector to be used in the Type class itself. /// @brief Type destruction function void Type::destroy() const { + // Nothing calls getForwardedType from here on. + if (ForwardType && ForwardType->isAbstract()) { + ForwardType->dropRef(); + ForwardType = NULL; + } // Structures and Functions allocate their contained types past the end of // the type object itself. 
These need to be destroyed differently than the @@ -87,11 +92,6 @@ void Type::destroy() const { pImpl->OpaqueTypes.erase(opaque_this); } - if (ForwardType && ForwardType->isAbstract()) { - ForwardType->dropRef(); - ForwardType = NULL; - } - // For all the other type subclasses, there is either no contained types or // just one (all Sequentials). For Sequentials, the PATypeHandle is not // allocated past the type object, its included directly in the SequentialType diff --git a/lib/VMCore/Value.cpp b/lib/VMCore/Value.cpp index a36d2628250d..645dd5a21c74 100644 --- a/lib/VMCore/Value.cpp +++ b/lib/VMCore/Value.cpp @@ -86,7 +86,7 @@ Value::~Value() { /// hasNUses - Return true if this Value has exactly N users. /// bool Value::hasNUses(unsigned N) const { - use_const_iterator UI = use_begin(), E = use_end(); + const_use_iterator UI = use_begin(), E = use_end(); for (; N; --N, ++UI) if (UI == E) return false; // Too few. @@ -97,7 +97,7 @@ bool Value::hasNUses(unsigned N) const { /// logically equivalent to getNumUses() >= N. /// bool Value::hasNUsesOrMore(unsigned N) const { - use_const_iterator UI = use_begin(), E = use_end(); + const_use_iterator UI = use_begin(), E = use_end(); for (; N; --N, ++UI) if (UI == E) return false; // Too few. @@ -108,7 +108,7 @@ bool Value::hasNUsesOrMore(unsigned N) const { /// isUsedInBasicBlock - Return true if this value is used in the specified /// basic block. 
bool Value::isUsedInBasicBlock(const BasicBlock *BB) const { - for (use_const_iterator I = use_begin(), E = use_end(); I != E; ++I) { + for (const_use_iterator I = use_begin(), E = use_end(); I != E; ++I) { const Instruction *User = dyn_cast(*I); if (User && User->getParent() == BB) return true; diff --git a/lib/VMCore/ValueSymbolTable.cpp b/lib/VMCore/ValueSymbolTable.cpp index d30a9d6e37b3..449d61a2cbb1 100644 --- a/lib/VMCore/ValueSymbolTable.cpp +++ b/lib/VMCore/ValueSymbolTable.cpp @@ -55,9 +55,7 @@ void ValueSymbolTable::reinsertValue(Value* V) { raw_svector_ostream(UniqueName) << ++LastUnique; // Try insert the vmap entry with this suffix. - ValueName &NewName = - vmap.GetOrCreateValue(StringRef(UniqueName.data(), - UniqueName.size())); + ValueName &NewName = vmap.GetOrCreateValue(UniqueName); if (NewName.getValue() == 0) { // Newly inserted name. Success! NewName.setValue(V); @@ -88,7 +86,7 @@ ValueName *ValueSymbolTable::createValueName(StringRef Name, Value *V) { } // Otherwise, there is a naming conflict. Rename this value. - SmallString<128> UniqueName(Name.begin(), Name.end()); + SmallString<256> UniqueName(Name.begin(), Name.end()); while (1) { // Trim any suffix off and append the next number. @@ -96,9 +94,7 @@ ValueName *ValueSymbolTable::createValueName(StringRef Name, Value *V) { raw_svector_ostream(UniqueName) << ++LastUnique; // Try insert the vmap entry with this suffix. - ValueName &NewName = - vmap.GetOrCreateValue(StringRef(UniqueName.data(), - UniqueName.size())); + ValueName &NewName = vmap.GetOrCreateValue(UniqueName); if (NewName.getValue() == 0) { // Newly inserted name. Success! 
NewName.setValue(V); diff --git a/lib/VMCore/Verifier.cpp b/lib/VMCore/Verifier.cpp index f1413825697a..c18168d8a5c7 100644 --- a/lib/VMCore/Verifier.cpp +++ b/lib/VMCore/Verifier.cpp @@ -676,17 +676,13 @@ void Verifier::visitFunction(Function &F) { "blockaddress may not be used with the entry block!", Entry); } } - + // If this function is actually an intrinsic, verify that it is only used in // direct call/invokes, never having its "address taken". if (F.getIntrinsicID()) { - for (Value::use_iterator UI = F.use_begin(), E = F.use_end(); UI != E;++UI){ - User *U = cast(UI); - if ((isa(U) || isa(U)) && UI.getOperandNo() == 0) - continue; // Direct calls/invokes are ok. - + const User *U; + if (F.hasAddressTaken(&U)) Assert1(0, "Invalid user of intrinsic instruction!", U); - } } } @@ -1483,7 +1479,7 @@ void Verifier::visitInstruction(Instruction &I) { "Instruction does not dominate all uses!", Op, &I); } } else if (isa(I.getOperand(i))) { - Assert1(i == 0 && (isa(I) || isa(I)), + Assert1((i == 0 && isa(I)) || (i + 3 == e && isa(I)), "Cannot take the address of an inline asm!", &I); } } @@ -1683,13 +1679,11 @@ void Verifier::visitIntrinsicFunctionCall(Intrinsic::ID ID, CallInst &CI) { /// parameters beginning with NumRets. 
/// static std::string IntrinsicParam(unsigned ArgNo, unsigned NumRets) { - if (ArgNo < NumRets) { - if (NumRets == 1) - return "Intrinsic result type"; - else - return "Intrinsic result type #" + utostr(ArgNo); - } else + if (ArgNo >= NumRets) return "Intrinsic parameter #" + utostr(ArgNo - NumRets); + if (NumRets == 1) + return "Intrinsic result type"; + return "Intrinsic result type #" + utostr(ArgNo); } bool Verifier::PerformTypeCheck(Intrinsic::ID ID, Function *F, const Type *Ty, @@ -1706,9 +1700,13 @@ bool Verifier::PerformTypeCheck(Intrinsic::ID ID, Function *F, const Type *Ty, const Type *RetTy = FTy->getReturnType(); const StructType *ST = dyn_cast(RetTy); - unsigned NumRets = 1; - if (ST) - NumRets = ST->getNumElements(); + unsigned NumRetVals; + if (RetTy->isVoidTy()) + NumRetVals = 0; + else if (ST) + NumRetVals = ST->getNumElements(); + else + NumRetVals = 1; if (VT < 0) { int Match = ~VT; @@ -1720,7 +1718,7 @@ bool Verifier::PerformTypeCheck(Intrinsic::ID ID, Function *F, const Type *Ty, TruncatedElementVectorType)) != 0) { const IntegerType *IEltTy = dyn_cast(EltTy); if (!VTy || !IEltTy) { - CheckFailed(IntrinsicParam(ArgNo, NumRets) + " is not " + CheckFailed(IntrinsicParam(ArgNo, NumRetVals) + " is not " "an integral vector type.", F); return false; } @@ -1728,7 +1726,7 @@ bool Verifier::PerformTypeCheck(Intrinsic::ID ID, Function *F, const Type *Ty, // the type being matched against. 
if ((Match & ExtendedElementVectorType) != 0) { if ((IEltTy->getBitWidth() & 1) != 0) { - CheckFailed(IntrinsicParam(ArgNo, NumRets) + " vector " + CheckFailed(IntrinsicParam(ArgNo, NumRetVals) + " vector " "element bit-width is odd.", F); return false; } @@ -1738,25 +1736,25 @@ bool Verifier::PerformTypeCheck(Intrinsic::ID ID, Function *F, const Type *Ty, Match &= ~(ExtendedElementVectorType | TruncatedElementVectorType); } - if (Match <= static_cast(NumRets - 1)) { + if (Match <= static_cast(NumRetVals - 1)) { if (ST) RetTy = ST->getElementType(Match); if (Ty != RetTy) { - CheckFailed(IntrinsicParam(ArgNo, NumRets) + " does not " + CheckFailed(IntrinsicParam(ArgNo, NumRetVals) + " does not " "match return type.", F); return false; } } else { - if (Ty != FTy->getParamType(Match - NumRets)) { - CheckFailed(IntrinsicParam(ArgNo, NumRets) + " does not " - "match parameter %" + utostr(Match - NumRets) + ".", F); + if (Ty != FTy->getParamType(Match - NumRetVals)) { + CheckFailed(IntrinsicParam(ArgNo, NumRetVals) + " does not " + "match parameter %" + utostr(Match - NumRetVals) + ".", F); return false; } } } else if (VT == MVT::iAny) { if (!EltTy->isIntegerTy()) { - CheckFailed(IntrinsicParam(ArgNo, NumRets) + " is not " + CheckFailed(IntrinsicParam(ArgNo, NumRetVals) + " is not " "an integer type.", F); return false; } @@ -1781,7 +1779,7 @@ bool Verifier::PerformTypeCheck(Intrinsic::ID ID, Function *F, const Type *Ty, } } else if (VT == MVT::fAny) { if (!EltTy->isFloatingPointTy()) { - CheckFailed(IntrinsicParam(ArgNo, NumRets) + " is not " + CheckFailed(IntrinsicParam(ArgNo, NumRetVals) + " is not " "a floating-point type.", F); return false; } @@ -1794,13 +1792,14 @@ bool Verifier::PerformTypeCheck(Intrinsic::ID ID, Function *F, const Type *Ty, Suffix += EVT::getEVT(EltTy).getEVTString(); } else if (VT == MVT::vAny) { if (!VTy) { - CheckFailed(IntrinsicParam(ArgNo, NumRets) + " is not a vector type.", F); + CheckFailed(IntrinsicParam(ArgNo, NumRetVals) + " is not a 
vector type.", + F); return false; } Suffix += ".v" + utostr(NumElts) + EVT::getEVT(EltTy).getEVTString(); } else if (VT == MVT::iPTR) { if (!Ty->isPointerTy()) { - CheckFailed(IntrinsicParam(ArgNo, NumRets) + " is not a " + CheckFailed(IntrinsicParam(ArgNo, NumRetVals) + " is not a " "pointer and a pointer is required.", F); return false; } @@ -1812,7 +1811,7 @@ bool Verifier::PerformTypeCheck(Intrinsic::ID ID, Function *F, const Type *Ty, Suffix += ".p" + utostr(PTyp->getAddressSpace()) + EVT::getEVT(PTyp->getElementType()).getEVTString(); } else { - CheckFailed(IntrinsicParam(ArgNo, NumRets) + " is not a " + CheckFailed(IntrinsicParam(ArgNo, NumRetVals) + " is not a " "pointer and a pointer is required.", F); return false; } @@ -1832,10 +1831,10 @@ bool Verifier::PerformTypeCheck(Intrinsic::ID ID, Function *F, const Type *Ty, } } else if (EVT((MVT::SimpleValueType)VT).getTypeForEVT(Ty->getContext()) != EltTy) { - CheckFailed(IntrinsicParam(ArgNo, NumRets) + " is wrong!", F); + CheckFailed(IntrinsicParam(ArgNo, NumRetVals) + " is wrong!", F); return false; } else if (EltTy != Ty) { - CheckFailed(IntrinsicParam(ArgNo, NumRets) + " is a vector " + CheckFailed(IntrinsicParam(ArgNo, NumRetVals) + " is a vector " "and a scalar is required.", F); return false; } @@ -1847,10 +1846,10 @@ bool Verifier::PerformTypeCheck(Intrinsic::ID ID, Function *F, const Type *Ty, /// Intrinsics.gen. This implements a little state machine that verifies the /// prototype of intrinsics. void Verifier::VerifyIntrinsicPrototype(Intrinsic::ID ID, Function *F, - unsigned RetNum, - unsigned ParamNum, ...) { + unsigned NumRetVals, + unsigned NumParams, ...) { va_list VA; - va_start(VA, ParamNum); + va_start(VA, NumParams); const FunctionType *FTy = F->getFunctionType(); // For overloaded intrinsics, the Suffix of the function name must match the @@ -1858,7 +1857,7 @@ void Verifier::VerifyIntrinsicPrototype(Intrinsic::ID ID, Function *F, // suffix, to be checked at the end. 
std::string Suffix; - if (FTy->getNumParams() + FTy->isVarArg() != ParamNum) { + if (FTy->getNumParams() + FTy->isVarArg() != NumParams) { CheckFailed("Intrinsic prototype has incorrect number of arguments!", F); return; } @@ -1866,23 +1865,27 @@ void Verifier::VerifyIntrinsicPrototype(Intrinsic::ID ID, Function *F, const Type *Ty = FTy->getReturnType(); const StructType *ST = dyn_cast(Ty); + if (NumRetVals == 0 && !Ty->isVoidTy()) { + CheckFailed("Intrinsic should return void", F); + return; + } + // Verify the return types. - if (ST && ST->getNumElements() != RetNum) { + if (ST && ST->getNumElements() != NumRetVals) { CheckFailed("Intrinsic prototype has incorrect number of return types!", F); return; } - - for (unsigned ArgNo = 0; ArgNo < RetNum; ++ArgNo) { + + for (unsigned ArgNo = 0; ArgNo != NumRetVals; ++ArgNo) { int VT = va_arg(VA, int); // An MVT::SimpleValueType when non-negative. if (ST) Ty = ST->getElementType(ArgNo); - if (!PerformTypeCheck(ID, F, Ty, VT, ArgNo, Suffix)) break; } // Verify the parameter types. - for (unsigned ArgNo = 0; ArgNo < ParamNum; ++ArgNo) { + for (unsigned ArgNo = 0; ArgNo != NumParams; ++ArgNo) { int VT = va_arg(VA, int); // An MVT::SimpleValueType when non-negative. 
if (VT == MVT::isVoid && ArgNo > 0) { @@ -1891,8 +1894,8 @@ void Verifier::VerifyIntrinsicPrototype(Intrinsic::ID ID, Function *F, break; } - if (!PerformTypeCheck(ID, F, FTy->getParamType(ArgNo), VT, ArgNo + RetNum, - Suffix)) + if (!PerformTypeCheck(ID, F, FTy->getParamType(ArgNo), VT, + ArgNo + NumRetVals, Suffix)) break; } diff --git a/test/Bitcode/sse41_pmulld.ll b/test/Bitcode/sse41_pmulld.ll new file mode 100644 index 000000000000..caf85479bbc2 --- /dev/null +++ b/test/Bitcode/sse41_pmulld.ll @@ -0,0 +1,2 @@ +; RUN: llvm-dis < %s.bc | not grep {i32 @llvm\\.pmulld} +; RUN: llvm-dis < %s.bc | grep mul \ No newline at end of file diff --git a/test/Bitcode/sse41_pmulld.ll.bc b/test/Bitcode/sse41_pmulld.ll.bc new file mode 100644 index 0000000000000000000000000000000000000000..bd66f0a05ca62ce3e05462d1beb5ad790ae9c48f GIT binary patch literal 560 zcmcb2K%AQa2tVT$@#!NMIHu*W}q0O00V;%kY;aGp6Gaj#gk3I$Z`^6 z3y&%*hol@!cT$6ZE0fbCZl@LjBM}P@2~RE~7Zne#35v=`f*N=v6^m5ZPjCn(tyr+a z;n)X8pgEor3=FJ5n!Dvc5Xc{ph*J0iVllWeFa#H1?rx(~hEMPBlV6RGGF99=hC-4<6V6On83v4eN z*sH(@B$&WnCBR;`fE~yHa*Gw%s|?tS7O(>uKrV_238=COXtMmMiunr_K-MA13v3S= z*sFjD=%fp5pCJs0+YWngD~D01JD7z*&(*kXjHZRA69W1JY~` eJ}oAZ4qJ6ZAII-`=PlHD{omn$(?%dAU;qHnxst5_ literal 0 HcmV?d00001 diff --git a/test/CodeGen/ARM/fabss.ll b/test/CodeGen/ARM/fabss.ll index e5b5791b3cda..f03282bdab7f 100644 --- a/test/CodeGen/ARM/fabss.ll +++ b/test/CodeGen/ARM/fabss.ll @@ -1,6 +1,5 @@ ; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s -check-prefix=VFP2 -; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=1 | FileCheck %s -check-prefix=NFP1 -; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=0 | FileCheck %s -check-prefix=NFP0 +; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s -check-prefix=NFP0 ; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s -check-prefix=CORTEXA8 ; RUN: llc < %s -march=arm -mcpu=cortex-a9 | FileCheck %s -check-prefix=CORTEXA9 diff --git a/test/CodeGen/ARM/fadds.ll b/test/CodeGen/ARM/fadds.ll 
index db18a86eccd8..749690e98d0f 100644 --- a/test/CodeGen/ARM/fadds.ll +++ b/test/CodeGen/ARM/fadds.ll @@ -1,6 +1,5 @@ ; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s -check-prefix=VFP2 -; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=1 | FileCheck %s -check-prefix=NFP1 -; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=0 | FileCheck %s -check-prefix=NFP0 +; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s -check-prefix=NFP0 ; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s -check-prefix=CORTEXA8 ; RUN: llc < %s -march=arm -mcpu=cortex-a9 | FileCheck %s -check-prefix=CORTEXA9 diff --git a/test/CodeGen/ARM/fdivs.ll b/test/CodeGen/ARM/fdivs.ll index a5c86bf26343..0c3149579297 100644 --- a/test/CodeGen/ARM/fdivs.ll +++ b/test/CodeGen/ARM/fdivs.ll @@ -1,6 +1,5 @@ ; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s -check-prefix=VFP2 -; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=1 | FileCheck %s -check-prefix=NFP1 -; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=0 | FileCheck %s -check-prefix=NFP0 +; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s -check-prefix=NFP0 ; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s -check-prefix=CORTEXA8 ; RUN: llc < %s -march=arm -mcpu=cortex-a9 | FileCheck %s -check-prefix=CORTEXA9 diff --git a/test/CodeGen/ARM/fmacs.ll b/test/CodeGen/ARM/fmacs.ll index 904a58739370..f8b47b5bac0d 100644 --- a/test/CodeGen/ARM/fmacs.ll +++ b/test/CodeGen/ARM/fmacs.ll @@ -1,6 +1,5 @@ ; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s -check-prefix=VFP2 -; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=1 | FileCheck %s -check-prefix=NFP1 -; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=0 | FileCheck %s -check-prefix=NFP0 +; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s -check-prefix=NFP0 ; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s -check-prefix=CORTEXA8 ; RUN: llc < %s -march=arm -mcpu=cortex-a9 | FileCheck %s -check-prefix=CORTEXA9 diff 
--git a/test/CodeGen/ARM/fmscs.ll b/test/CodeGen/ARM/fmscs.ll index 7b9e029b676e..7a70543dee6c 100644 --- a/test/CodeGen/ARM/fmscs.ll +++ b/test/CodeGen/ARM/fmscs.ll @@ -1,6 +1,5 @@ ; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s -check-prefix=VFP2 -; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=1 | FileCheck %s -check-prefix=NFP1 -; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=0 | FileCheck %s -check-prefix=NFP0 +; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s -check-prefix=NFP0 ; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s -check-prefix=CORTEXA8 ; RUN: llc < %s -march=arm -mcpu=cortex-a9 | FileCheck %s -check-prefix=CORTEXA9 diff --git a/test/CodeGen/ARM/fmuls.ll b/test/CodeGen/ARM/fmuls.ll index d3c9c82e9745..ef4e3e52818e 100644 --- a/test/CodeGen/ARM/fmuls.ll +++ b/test/CodeGen/ARM/fmuls.ll @@ -1,6 +1,5 @@ ; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s -check-prefix=VFP2 -; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=1 | FileCheck %s -check-prefix=NFP1 -; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=0 | FileCheck %s -check-prefix=NFP0 +; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s -check-prefix=NFP0 ; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s -check-prefix=CORTEXA8 ; RUN: llc < %s -march=arm -mcpu=cortex-a9 | FileCheck %s -check-prefix=CORTEXA9 diff --git a/test/CodeGen/ARM/fnegs.ll b/test/CodeGen/ARM/fnegs.ll index d6c22f14a4ca..c15005e6e8ab 100644 --- a/test/CodeGen/ARM/fnegs.ll +++ b/test/CodeGen/ARM/fnegs.ll @@ -1,6 +1,5 @@ ; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s -check-prefix=VFP2 -; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=1 | FileCheck %s -check-prefix=NFP1 -; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=0 | FileCheck %s -check-prefix=NFP0 +; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s -check-prefix=NFP0 ; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s -check-prefix=CORTEXA8 ; RUN: llc < %s -march=arm 
-mcpu=cortex-a9 | FileCheck %s -check-prefix=CORTEXA9 diff --git a/test/CodeGen/ARM/fnmacs.ll b/test/CodeGen/ARM/fnmacs.ll index 724947ea04d3..1d1d06a70ea6 100644 --- a/test/CodeGen/ARM/fnmacs.ll +++ b/test/CodeGen/ARM/fnmacs.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s -check-prefix=VFP2 -; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=0 | FileCheck %s -check-prefix=NEON -; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=1 | FileCheck %s -check-prefix=NEONFP +; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s -check-prefix=NEON +; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s -check-prefix=NEONFP define float @test(float %acc, float %a, float %b) { entry: diff --git a/test/CodeGen/ARM/fnmscs.ll b/test/CodeGen/ARM/fnmscs.ll index ad2188218e4d..6b7cefa6414d 100644 --- a/test/CodeGen/ARM/fnmscs.ll +++ b/test/CodeGen/ARM/fnmscs.ll @@ -1,6 +1,5 @@ ; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s -; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=1 | FileCheck %s -; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=0 | FileCheck %s +; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s ; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s ; RUN: llc < %s -march=arm -mcpu=cortex-a9 | FileCheck %s diff --git a/test/CodeGen/ARM/fp_convert.ll b/test/CodeGen/ARM/fp_convert.ll index 2adac78cf806..1ef9f7f32164 100644 --- a/test/CodeGen/ARM/fp_convert.ll +++ b/test/CodeGen/ARM/fp_convert.ll @@ -1,6 +1,5 @@ ; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s -check-prefix=VFP2 -; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=1 | FileCheck %s -check-prefix=NEON -; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=0 | FileCheck %s -check-prefix=VFP2 +; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s -check-prefix=VFP2 ; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s -check-prefix=NEON ; RUN: llc < %s -march=arm -mcpu=cortex-a9 | FileCheck %s -check-prefix=VFP2 diff 
--git a/test/CodeGen/ARM/fsubs.ll b/test/CodeGen/ARM/fsubs.ll index ae98be307892..bea8d5f4f30b 100644 --- a/test/CodeGen/ARM/fsubs.ll +++ b/test/CodeGen/ARM/fsubs.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s -check-prefix=VFP2 -; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=1 | FileCheck %s -check-prefix=NFP1 -; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=0 | FileCheck %s -check-prefix=NFP0 +; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s -check-prefix=NFP1 +; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s -check-prefix=NFP0 define float @test(float %a, float %b) { entry: diff --git a/test/CodeGen/CellSPU/bigstack.ll b/test/CodeGen/CellSPU/bigstack.ll new file mode 100644 index 000000000000..5483f463732b --- /dev/null +++ b/test/CodeGen/CellSPU/bigstack.ll @@ -0,0 +1,17 @@ +; RUN: llc < %s -march=cellspu -o %t1.s +; RUN: grep lqx %t1.s | count 4 +; RUN: grep il %t1.s | grep -v file | count 7 +; RUN: grep stqx %t1.s | count 2 + +define i32 @bigstack() nounwind { +entry: + %avar = alloca i32 + %big_data = alloca [2048 x i32] + store i32 3840, i32* %avar, align 4 + br label %return + +return: + %retval = load i32* %avar + ret i32 %retval +} + diff --git a/test/CodeGen/Generic/2010-ZeroSizedArg.ll b/test/CodeGen/Generic/2010-ZeroSizedArg.ll new file mode 100644 index 000000000000..ba40bd08e8e9 --- /dev/null +++ b/test/CodeGen/Generic/2010-ZeroSizedArg.ll @@ -0,0 +1,17 @@ +; RUN: llc < %s +; PR4975 + +%0 = type <{ [0 x i32] }> +%union.T0 = type { } + +@.str = private constant [1 x i8] c" " + +define arm_apcscc void @t(%0) nounwind { +entry: + %arg0 = alloca %union.T0 + %1 = bitcast %union.T0* %arg0 to %0* + store %0 %0, %0* %1, align 1 + ret void +} + +declare arm_apcscc i32 @printf(i8*, ...) 
diff --git a/test/CodeGen/Generic/addr-label.ll b/test/CodeGen/Generic/addr-label.ll index 51741110e072..0dbe5021bbf0 100644 --- a/test/CodeGen/Generic/addr-label.ll +++ b/test/CodeGen/Generic/addr-label.ll @@ -56,3 +56,26 @@ ret: ret i32 -1 } + +; PR6673 + +define i64 @test4a() { + %target = bitcast i8* blockaddress(@test4b, %usermain) to i8* + %ret = call i64 @test4b(i8* %target) + + ret i64 %ret +} + +define i64 @test4b(i8* %Code) { +entry: + indirectbr i8* %Code, [label %usermain] +usermain: + br label %label_line_0 + +label_line_0: + br label %label_line_1 + +label_line_1: + %target = ptrtoint i8* blockaddress(@test4b, %label_line_0) to i64 + ret i64 %target +} diff --git a/test/CodeGen/PIC16/2009-07-17-PR4566-pic16.ll b/test/CodeGen/PIC16/2009-07-17-PR4566-pic16.ll index b508026c21f6..5b5e11f2df0c 100644 --- a/test/CodeGen/PIC16/2009-07-17-PR4566-pic16.ll +++ b/test/CodeGen/PIC16/2009-07-17-PR4566-pic16.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=pic16 | FileCheck %s +; XFAIL: vg_leak target datalayout = "e-p:16:8:8-i8:8:8-i16:8:8-i32:8:8-f32:32:32" target triple = "pic16-" diff --git a/test/CodeGen/PIC16/2009-11-20-NewNode.ll b/test/CodeGen/PIC16/2009-11-20-NewNode.ll index d68f0f41c4a5..f9d66ca63471 100644 --- a/test/CodeGen/PIC16/2009-11-20-NewNode.ll +++ b/test/CodeGen/PIC16/2009-11-20-NewNode.ll @@ -1,5 +1,6 @@ ; RUN: llc -march=pic16 < %s ; PR5558 +; XFAIL: vg_leak define i64 @_strtoll_r(i16 %base) nounwind { entry: diff --git a/test/CodeGen/PIC16/C16-15.ll b/test/CodeGen/PIC16/C16-15.ll index 5ca2d4a9bd63..020b0dd6743e 100644 --- a/test/CodeGen/PIC16/C16-15.ll +++ b/test/CodeGen/PIC16/C16-15.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=pic16 | grep "extern" | grep "@.lib.unordered.f32" | count 3 +; XFAIL: vg_leak @pc = global i8* inttoptr (i64 160 to i8*), align 1 ; [#uses=2] @aa = common global i16 0, align 1 ; [#uses=0] diff --git a/test/CodeGen/PIC16/global-in-user-section.ll b/test/CodeGen/PIC16/global-in-user-section.ll index 74c9d9d256cd..6cdb64864ad5 
100644 --- a/test/CodeGen/PIC16/global-in-user-section.ll +++ b/test/CodeGen/PIC16/global-in-user-section.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=pic16 | FileCheck %s +; XFAIL: vg_leak @G1 = common global i16 0, section "usersection", align 1 ; CHECK: usersection UDATA diff --git a/test/CodeGen/PIC16/globals.ll b/test/CodeGen/PIC16/globals.ll index 432c291078d3..3ee2e25265d3 100644 --- a/test/CodeGen/PIC16/globals.ll +++ b/test/CodeGen/PIC16/globals.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=pic16 | FileCheck %s +; XFAIL: vg_leak @G1 = global i32 4712, section "Address=412" ; CHECK: @G1.412..user_section.# IDATA 412 diff --git a/test/CodeGen/PIC16/sext.ll b/test/CodeGen/PIC16/sext.ll index b49925ffb7c3..e51a54287ce6 100644 --- a/test/CodeGen/PIC16/sext.ll +++ b/test/CodeGen/PIC16/sext.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=pic16 +; XFAIL: vg_leak @main.auto.c = internal global i8 0 ; [#uses=1] diff --git a/test/CodeGen/PowerPC/2010-04-01-MachineCSEBug.ll b/test/CodeGen/PowerPC/2010-04-01-MachineCSEBug.ll new file mode 100644 index 000000000000..8fd05502ba86 --- /dev/null +++ b/test/CodeGen/PowerPC/2010-04-01-MachineCSEBug.ll @@ -0,0 +1,70 @@ +; RUN: llc < %s -mtriple=powerpc-apple-darwin10.0 +; rdar://7819990 + +%0 = type { i32 } +%1 = type { i64 } +%struct.Buffer = type { [1024 x i8], i64, i64, i64 } +%struct.InStream = type { %struct.Buffer, %0, %1, i32*, %struct.InStreamMethods* } +%struct.InStreamMethods = type { void (%struct.InStream*, i8*, i32)*, void (%struct.InStream*, i64)*, i64 (%struct.InStream*)*, void (%struct.InStream*)* } + +define i64 @t(%struct.InStream* %is) nounwind optsize ssp { +entry: + br i1 undef, label %is_read_byte.exit, label %bb.i + +bb.i: ; preds = %entry + br label %is_read_byte.exit + +is_read_byte.exit: ; preds = %bb.i, %entry + br i1 undef, label %is_read_byte.exit22, label %bb.i21 + +bb.i21: ; preds = %is_read_byte.exit + unreachable + +is_read_byte.exit22: ; preds = %is_read_byte.exit + br i1 undef, label 
%is_read_byte.exit19, label %bb.i18 + +bb.i18: ; preds = %is_read_byte.exit22 + br label %is_read_byte.exit19 + +is_read_byte.exit19: ; preds = %bb.i18, %is_read_byte.exit22 + br i1 undef, label %is_read_byte.exit16, label %bb.i15 + +bb.i15: ; preds = %is_read_byte.exit19 + unreachable + +is_read_byte.exit16: ; preds = %is_read_byte.exit19 + %0 = shl i64 undef, 32 ; [#uses=1] + br i1 undef, label %is_read_byte.exit13, label %bb.i12 + +bb.i12: ; preds = %is_read_byte.exit16 + unreachable + +is_read_byte.exit13: ; preds = %is_read_byte.exit16 + %1 = shl i64 undef, 24 ; [#uses=1] + br i1 undef, label %is_read_byte.exit10, label %bb.i9 + +bb.i9: ; preds = %is_read_byte.exit13 + unreachable + +is_read_byte.exit10: ; preds = %is_read_byte.exit13 + %2 = shl i64 undef, 16 ; [#uses=1] + br i1 undef, label %is_read_byte.exit7, label %bb.i6 + +bb.i6: ; preds = %is_read_byte.exit10 + br label %is_read_byte.exit7 + +is_read_byte.exit7: ; preds = %bb.i6, %is_read_byte.exit10 + %3 = shl i64 undef, 8 ; [#uses=1] + br i1 undef, label %is_read_byte.exit4, label %bb.i3 + +bb.i3: ; preds = %is_read_byte.exit7 + unreachable + +is_read_byte.exit4: ; preds = %is_read_byte.exit7 + %4 = or i64 0, %0 ; [#uses=1] + %5 = or i64 %4, %1 ; [#uses=1] + %6 = or i64 %5, %2 ; [#uses=1] + %7 = or i64 %6, %3 ; [#uses=1] + %8 = or i64 %7, 0 ; [#uses=1] + ret i64 %8 +} diff --git a/test/CodeGen/PowerPC/eqv-andc-orc-nor.ll b/test/CodeGen/PowerPC/eqv-andc-orc-nor.ll index 558fd1b3199b..f99089b3bb02 100644 --- a/test/CodeGen/PowerPC/eqv-andc-orc-nor.ll +++ b/test/CodeGen/PowerPC/eqv-andc-orc-nor.ll @@ -9,66 +9,66 @@ ; RUN: llc < %s -march=ppc32 | \ ; RUN: grep nand | count 1 -define i32 @EQV1(i32 %X, i32 %Y) { +define i32 @EQV1(i32 %X, i32 %Y) nounwind { %A = xor i32 %X, %Y ; [#uses=1] %B = xor i32 %A, -1 ; [#uses=1] ret i32 %B } -define i32 @EQV2(i32 %X, i32 %Y) { +define i32 @EQV2(i32 %X, i32 %Y) nounwind { %A = xor i32 %X, -1 ; [#uses=1] %B = xor i32 %A, %Y ; [#uses=1] ret i32 %B } -define i32 @EQV3(i32 
%X, i32 %Y) { +define i32 @EQV3(i32 %X, i32 %Y) nounwind { %A = xor i32 %X, -1 ; [#uses=1] %B = xor i32 %Y, %A ; [#uses=1] ret i32 %B } -define i32 @ANDC1(i32 %X, i32 %Y) { +define i32 @ANDC1(i32 %X, i32 %Y) nounwind { %A = xor i32 %Y, -1 ; [#uses=1] %B = and i32 %X, %A ; [#uses=1] ret i32 %B } -define i32 @ANDC2(i32 %X, i32 %Y) { +define i32 @ANDC2(i32 %X, i32 %Y) nounwind { %A = xor i32 %X, -1 ; [#uses=1] %B = and i32 %A, %Y ; [#uses=1] ret i32 %B } -define i32 @ORC1(i32 %X, i32 %Y) { +define i32 @ORC1(i32 %X, i32 %Y) nounwind { %A = xor i32 %Y, -1 ; [#uses=1] %B = or i32 %X, %A ; [#uses=1] ret i32 %B } -define i32 @ORC2(i32 %X, i32 %Y) { +define i32 @ORC2(i32 %X, i32 %Y) nounwind { %A = xor i32 %X, -1 ; [#uses=1] %B = or i32 %A, %Y ; [#uses=1] ret i32 %B } -define i32 @NOR1(i32 %X) { +define i32 @NOR1(i32 %X) nounwind { %Y = xor i32 %X, -1 ; [#uses=1] ret i32 %Y } -define i32 @NOR2(i32 %X, i32 %Y) { +define i32 @NOR2(i32 %X, i32 %Y) nounwind { %Z = or i32 %X, %Y ; [#uses=1] %R = xor i32 %Z, -1 ; [#uses=1] ret i32 %R } -define i32 @NAND1(i32 %X, i32 %Y) { +define i32 @NAND1(i32 %X, i32 %Y) nounwind { %Z = and i32 %X, %Y ; [#uses=1] %W = xor i32 %Z, -1 ; [#uses=1] ret i32 %W } -define void @VNOR(<4 x float>* %P, <4 x float>* %Q) { +define void @VNOR(<4 x float>* %P, <4 x float>* %Q) nounwind { %tmp = load <4 x float>* %P ; <<4 x float>> [#uses=1] %tmp.upgrd.1 = bitcast <4 x float> %tmp to <4 x i32> ; <<4 x i32>> [#uses=1] %tmp2 = load <4 x float>* %Q ; <<4 x float>> [#uses=1] @@ -80,7 +80,7 @@ define void @VNOR(<4 x float>* %P, <4 x float>* %Q) { ret void } -define void @VANDC(<4 x float>* %P, <4 x float>* %Q) { +define void @VANDC(<4 x float>* %P, <4 x float>* %Q) nounwind { %tmp = load <4 x float>* %P ; <<4 x float>> [#uses=1] %tmp.upgrd.4 = bitcast <4 x float> %tmp to <4 x i32> ; <<4 x i32>> [#uses=1] %tmp2 = load <4 x float>* %Q ; <<4 x float>> [#uses=1] diff --git a/test/CodeGen/PowerPC/tango.net.ftp.FtpClient.ll 
b/test/CodeGen/PowerPC/tango.net.ftp.FtpClient.ll index 8a1288afa40c..6f103462664f 100644 --- a/test/CodeGen/PowerPC/tango.net.ftp.FtpClient.ll +++ b/test/CodeGen/PowerPC/tango.net.ftp.FtpClient.ll @@ -1,4 +1,6 @@ -; RUN: llc < %s +; RN: llc < %s +; RUN: false +; XFAIL: * ; PR4534 ; ModuleID = 'tango.net.ftp.FtpClient.bc' diff --git a/test/CodeGen/Thumb2/2009-08-04-CoalescerBug.ll b/test/CodeGen/Thumb2/2009-08-04-CoalescerBug.ll index 319d29b790e8..9c1fdb32e836 100644 --- a/test/CodeGen/Thumb2/2009-08-04-CoalescerBug.ll +++ b/test/CodeGen/Thumb2/2009-08-04-CoalescerBug.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mattr=+neon -arm-use-neon-fp -relocation-model=pic -disable-fp-elim +; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 -relocation-model=pic -disable-fp-elim type { %struct.GAP } ; type %0 type { i16, i8, i8 } ; type %1 diff --git a/test/CodeGen/Thumb2/2009-08-04-ScavengerAssert.ll b/test/CodeGen/Thumb2/2009-08-04-ScavengerAssert.ll index a62b61290a5a..317db64ae45c 100644 --- a/test/CodeGen/Thumb2/2009-08-04-ScavengerAssert.ll +++ b/test/CodeGen/Thumb2/2009-08-04-ScavengerAssert.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mattr=+neon -arm-use-neon-fp -relocation-model=pic -disable-fp-elim -O3 +; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 -relocation-model=pic -disable-fp-elim -O3 type { i16, i8, i8 } ; type %0 type { [2 x i32], [2 x i32] } ; type %1 diff --git a/test/CodeGen/Thumb2/2009-08-04-SubregLoweringBug.ll b/test/CodeGen/Thumb2/2009-08-04-SubregLoweringBug.ll index 76474746ee48..2bbc231f9622 100644 --- a/test/CodeGen/Thumb2/2009-08-04-SubregLoweringBug.ll +++ b/test/CodeGen/Thumb2/2009-08-04-SubregLoweringBug.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -mtriple=thumbv7-apple-darwin9 -mattr=+neon -arm-use-neon-fp -; RUN: llc < %s -mtriple=thumbv7-apple-darwin9 -mattr=+neon -arm-use-neon-fp | not grep fcpys +; RUN: llc < %s -mtriple=thumbv7-apple-darwin9 -mcpu=cortex-a8 +; RUN: llc < %s 
-mtriple=thumbv7-apple-darwin9 -mcpu=cortex-a8 | not grep fcpys ; rdar://7117307 %struct.Hosp = type { i32, i32, i32, %struct.List, %struct.List, %struct.List, %struct.List } diff --git a/test/CodeGen/Thumb2/2009-08-04-SubregLoweringBug2.ll b/test/CodeGen/Thumb2/2009-08-04-SubregLoweringBug2.ll index acf562c74a2a..82944847663a 100644 --- a/test/CodeGen/Thumb2/2009-08-04-SubregLoweringBug2.ll +++ b/test/CodeGen/Thumb2/2009-08-04-SubregLoweringBug2.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=thumbv7-apple-darwin9 -mattr=+neon -arm-use-neon-fp +; RUN: llc < %s -mtriple=thumbv7-apple-darwin9 -mcpu=cortex-a8 ; rdar://7117307 %struct.Hosp = type { i32, i32, i32, %struct.List, %struct.List, %struct.List, %struct.List } diff --git a/test/CodeGen/Thumb2/2009-08-04-SubregLoweringBug3.ll b/test/CodeGen/Thumb2/2009-08-04-SubregLoweringBug3.ll index 3ada02676bfc..b18c972aeddc 100644 --- a/test/CodeGen/Thumb2/2009-08-04-SubregLoweringBug3.ll +++ b/test/CodeGen/Thumb2/2009-08-04-SubregLoweringBug3.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=thumbv7-apple-darwin9 -mattr=+neon -arm-use-neon-fp +; RUN: llc < %s -mtriple=thumbv7-apple-darwin9 -mcpu=cortex-a8 ; rdar://7117307 %struct.Hosp = type { i32, i32, i32, %struct.List, %struct.List, %struct.List, %struct.List } diff --git a/test/CodeGen/Thumb2/2009-08-07-NeonFPBug.ll b/test/CodeGen/Thumb2/2009-08-07-NeonFPBug.ll index 090ed2d81f60..96bcbad77146 100644 --- a/test/CodeGen/Thumb2/2009-08-07-NeonFPBug.ll +++ b/test/CodeGen/Thumb2/2009-08-07-NeonFPBug.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=thumbv7-apple-darwin10 -mcpu=cortex-a8 -arm-use-neon-fp +; RUN: llc < %s -mtriple=thumbv7-apple-darwin10 -mcpu=cortex-a8 %struct.FILE = type { i8*, i32, i32, i16, i16, %struct.__sbuf, i32, i8*, i32 (i8*)*, i32 (i8*, i8*, i32)*, i64 (i8*, i64, i32)*, i32 (i8*, i8*, i32)*, %struct.__sbuf, %struct.__sFILEX*, i32, [3 x i8], [1 x i8], %struct.__sbuf, i32, i64 } %struct.JHUFF_TBL = type { [17 x i8], [256 x i8], i32 } diff --git 
a/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll b/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll index 5e7c0a7ee2b7..a2288986362e 100644 --- a/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll +++ b/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll @@ -1,5 +1,4 @@ ; RUN: llc < %s -march=x86-64 > %t -; RUN: grep leaq %t ; RUN: not grep {,%rsp)} %t ; PR1103 diff --git a/test/CodeGen/X86/2007-04-25-MMX-PADDQ.ll b/test/CodeGen/X86/2007-04-25-MMX-PADDQ.ll index 113d0eb8647f..c39b82a1fe36 100644 --- a/test/CodeGen/X86/2007-04-25-MMX-PADDQ.ll +++ b/test/CodeGen/X86/2007-04-25-MMX-PADDQ.ll @@ -1,12 +1,16 @@ -; RUN: llc < %s -o - -march=x86 -mattr=+mmx | grep paddq | count 2 -; RUN: llc < %s -o - -march=x86 -mattr=+mmx | grep movq | count 2 +; RUN: llc < %s -o - -march=x86 -mattr=+mmx | FileCheck %s -define <1 x i64> @unsigned_add3(<1 x i64>* %a, <1 x i64>* %b, i32 %count) { +define <1 x i64> @unsigned_add3(<1 x i64>* %a, <1 x i64>* %b, i32 %count) nounwind { entry: %tmp2942 = icmp eq i32 %count, 0 ; [#uses=1] br i1 %tmp2942, label %bb31, label %bb26 bb26: ; preds = %bb26, %entry + +; CHECK: movq ({{.*}},8), %mm +; CHECK: paddq ({{.*}},8), %mm +; CHECK: paddq %mm{{[0-7]}}, %mm + %i.037.0 = phi i32 [ 0, %entry ], [ %tmp25, %bb26 ] ; [#uses=3] %sum.035.0 = phi <1 x i64> [ zeroinitializer, %entry ], [ %tmp22, %bb26 ] ; <<1 x i64>> [#uses=1] %tmp13 = getelementptr <1 x i64>* %b, i32 %i.037.0 ; <<1 x i64>*> [#uses=1] diff --git a/test/CodeGen/X86/2009-02-05-CoalescerBug.ll b/test/CodeGen/X86/2009-02-05-CoalescerBug.ll index 0ffa8fdc30dd..a46a20b1da65 100644 --- a/test/CodeGen/X86/2009-02-05-CoalescerBug.ll +++ b/test/CodeGen/X86/2009-02-05-CoalescerBug.ll @@ -1,5 +1,7 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2,-sse41 | grep movss | count 2 -; RUN: llc < %s -march=x86 -mattr=+sse2,-sse41 | grep movaps | count 4 +; RUN: llc < %s -march=x86 -mattr=+sse2,-sse41 -o %t +; RUN: grep movss %t | count 2 +; RUN: grep movaps %t | count 2 +; RUN: grep movdqa %t | count 2 define i1 @t([2 x float]* %y, 
[2 x float]* %w, i32, [2 x float]* %x.pn59, i32 %smax190, i32 %j.1180, <4 x float> %wu.2179, <4 x float> %wr.2178, <4 x float>* %tmp89.out, <4 x float>* %tmp107.out, i32* %indvar.next218.out) nounwind { newFuncRoot: diff --git a/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll b/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll index a4d642b40354..d9655fd19b3c 100644 --- a/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll +++ b/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll @@ -1,5 +1,7 @@ -; RUN: llc < %s -march=x86-64 -mattr=+sse3 -stats |& not grep {machine-licm} +; RUN: llc < %s -march=x86-64 -mattr=+sse3 -stats |& grep {2 machine-licm} +; RUN: llc < %s -march=x86-64 -mattr=+sse3 | FileCheck %s ; rdar://6627786 +; rdar://7792037 target triple = "x86_64-apple-darwin10.0" %struct.Key = type { i64 } @@ -11,6 +13,13 @@ entry: br label %bb4 bb4: ; preds = %bb.i, %bb26, %bb4, %entry +; CHECK: %bb4 +; CHECK: xorb +; CHECK: callq +; CHECK: movq +; CHECK: xorl +; CHECK: xorb + %0 = call i32 (...)* @xxGetOffsetForCode(i32 undef) nounwind ; [#uses=0] %ins = or i64 %p, 2097152 ; [#uses=1] %1 = call i32 (...)* @xxCalculateMidType(%struct.Key* %desc, i32 0) nounwind ; [#uses=1] diff --git a/test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll b/test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll index 3ce9edb5db44..26bf09c723a0 100644 --- a/test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll +++ b/test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 | FileCheck %s ; rdar://7396984 @str = private constant [28 x i8] c"xxxxxxxxxxxxxxxxxxxxxxxxxxx\00", align 1 diff --git a/test/CodeGen/X86/byval7.ll b/test/CodeGen/X86/byval7.ll index 0da93bad04e1..686ed9c74dde 100644 --- a/test/CodeGen/X86/byval7.ll +++ b/test/CodeGen/X86/byval7.ll @@ -1,10 +1,17 @@ -; RUN: llc < %s -march=x86 -mcpu=yonah | egrep {add|lea} | grep 16 +; RUN: llc < %s -march=x86 -mcpu=yonah | FileCheck %s %struct.S = 
type { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, + <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } define i32 @main() nounwind { entry: +; CHECK: main: +; CHECK: movl $1, (%esp) +; CHECK: leal 16(%esp), %edi +; CHECK: movl $36, %ecx +; CHECK: leal 160(%esp), %esi +; CHECK: rep;movsl %s = alloca %struct.S ; <%struct.S*> [#uses=2] %tmp15 = getelementptr %struct.S* %s, i32 0, i32 0 ; <<2 x i64>*> [#uses=1] store <2 x i64> < i64 8589934595, i64 1 >, <2 x i64>* %tmp15, align 16 diff --git a/test/CodeGen/X86/coalesce-esp.ll b/test/CodeGen/X86/coalesce-esp.ll index 0fe4e56c97ca..e0f2796f9dce 100644 --- a/test/CodeGen/X86/coalesce-esp.ll +++ b/test/CodeGen/X86/coalesce-esp.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s | grep {movl %esp, %eax} +; RUN: llc < %s | grep {movl %esp, %ecx} ; PR4572 ; Don't coalesce with %esp if it would end up putting %esp in diff --git a/test/CodeGen/X86/dagcombine-buildvector.ll b/test/CodeGen/X86/dagcombine-buildvector.ll index c0ee2ac3386b..5cc6eaa405ad 100644 --- a/test/CodeGen/X86/dagcombine-buildvector.ll +++ b/test/CodeGen/X86/dagcombine-buildvector.ll @@ -1,11 +1,11 @@ -; RUN: llc < %s -march=x86 -mcpu=penryn -disable-mmx -o %t -; RUN: grep unpcklpd %t | count 1 -; RUN: grep movapd %t | count 1 -; RUN: grep movaps %t | count 1 +; RUN: llc < %s -march=x86 -mcpu=penryn -disable-mmx | FileCheck %s ; Shows a dag combine bug that will generate an illegal build vector ; with v2i64 build_vector i32, i32. 
+; CHECK: test: +; CHECK: unpcklpd +; CHECK: movapd define void @test(<2 x double>* %dst, <4 x double> %src) nounwind { entry: %tmp7.i = shufflevector <4 x double> %src, <4 x double> undef, <2 x i32> < i32 0, i32 2 > @@ -13,6 +13,8 @@ entry: ret void } +; CHECK: test2: +; CHECK: movdqa define void @test2(<4 x i16>* %src, <4 x i32>* %dest) nounwind { entry: %tmp1 = load <4 x i16>* %src diff --git a/test/CodeGen/X86/gather-addresses.ll b/test/CodeGen/X86/gather-addresses.ll index 3e730de0a8e5..134ee28df6c8 100644 --- a/test/CodeGen/X86/gather-addresses.ll +++ b/test/CodeGen/X86/gather-addresses.ll @@ -5,7 +5,7 @@ ; bounce the vector off of cache rather than shuffling each individual ; element out of the index vector. -; CHECK: pand (%rdx), %xmm0 +; CHECK: andps (%rdx), %xmm0 ; CHECK: movaps %xmm0, -24(%rsp) ; CHECK: movslq -24(%rsp), %rax ; CHECK: movsd (%rdi,%rax,8), %xmm0 diff --git a/test/CodeGen/X86/licm-symbol.ll b/test/CodeGen/X86/licm-symbol.ll index d61bbfccbc96..08306c2950e2 100644 --- a/test/CodeGen/X86/licm-symbol.ll +++ b/test/CodeGen/X86/licm-symbol.ll @@ -3,7 +3,7 @@ ; MachineLICM should be able to hoist the sF reference out of the loop. 
; CHECK: pushl %esi -; CHECK: subl $8, %esp +; CHECK: subl $4, %esp ; CHECK: movl $176, %esi ; CHECK: addl L___sF$non_lazy_ptr, %esi ; CHECK: .align 4, 0x90 diff --git a/test/CodeGen/X86/memcpy-2.ll b/test/CodeGen/X86/memcpy-2.ll index 2dc939e666ff..4fe32d974d05 100644 --- a/test/CodeGen/X86/memcpy-2.ll +++ b/test/CodeGen/X86/memcpy-2.ll @@ -1,15 +1,122 @@ -; RUN: llc < %s -march=x86 -mattr=-sse -mtriple=i686-apple-darwin8.8.0 | grep mov | count 7 -; RUN: llc < %s -march=x86 -mattr=+sse -mtriple=i686-apple-darwin8.8.0 | grep mov | count 5 +; RUN: llc < %s -mattr=+sse2 -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=SSE2 +; RUN: llc < %s -mattr=+sse,-sse2 -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=SSE1 +; RUN: llc < %s -mattr=-sse -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=NOSSE +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=X86-64 %struct.ParmT = type { [25 x i8], i8, i8* } @.str12 = internal constant [25 x i8] c"image\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00" ; <[25 x i8]*> [#uses=1] -declare void @llvm.memcpy.i32(i8*, i8*, i32, i32) nounwind - -define void @t(i32 %argc, i8** %argv) nounwind { +define void @t1(i32 %argc, i8** %argv) nounwind { entry: +; SSE2: t1: +; SSE2: movaps _.str12, %xmm0 +; SSE2: movaps %xmm0 +; SSE2: movb $0 +; SSE2: movl $0 +; SSE2: movl $0 + +; SSE1: t1: +; SSE1: movaps _.str12, %xmm0 +; SSE1: movaps %xmm0 +; SSE1: movb $0 +; SSE1: movl $0 +; SSE1: movl $0 + +; NOSSE: t1: +; NOSSE: movb $0 +; NOSSE: movl $0 +; NOSSE: movl $0 +; NOSSE: movl $0 +; NOSSE: movl $0 +; NOSSE: movl $101 +; NOSSE: movl $1734438249 + +; X86-64: t1: +; X86-64: movaps _.str12(%rip), %xmm0 +; X86-64: movaps %xmm0 +; X86-64: movb $0 +; X86-64: movq $0 %parms.i = alloca [13 x %struct.ParmT] ; <[13 x %struct.ParmT]*> [#uses=1] %parms1.i = getelementptr [13 x %struct.ParmT]* %parms.i, i32 0, i32 0, i32 0, i32 0 ; [#uses=1] call void 
@llvm.memcpy.i32( i8* %parms1.i, i8* getelementptr ([25 x i8]* @.str12, i32 0, i32 0), i32 25, i32 1 ) nounwind unreachable } + +;rdar://7774704 +%struct.s0 = type { [2 x double] } + +define void @t2(%struct.s0* nocapture %a, %struct.s0* nocapture %b) nounwind ssp { +entry: +; SSE2: t2: +; SSE2: movaps (%eax), %xmm0 +; SSE2: movaps %xmm0, (%eax) + +; SSE1: t2: +; SSE1: movaps (%eax), %xmm0 +; SSE1: movaps %xmm0, (%eax) + +; NOSSE: t2: +; NOSSE: movl +; NOSSE: movl +; NOSSE: movl +; NOSSE: movl +; NOSSE: movl +; NOSSE: movl +; NOSSE: movl +; NOSSE: movl +; NOSSE: movl +; NOSSE: movl + +; X86-64: t2: +; X86-64: movaps (%rsi), %xmm0 +; X86-64: movaps %xmm0, (%rdi) + %tmp2 = bitcast %struct.s0* %a to i8* ; [#uses=1] + %tmp3 = bitcast %struct.s0* %b to i8* ; [#uses=1] + tail call void @llvm.memcpy.i32(i8* %tmp2, i8* %tmp3, i32 16, i32 16) + ret void +} + +define void @t3(%struct.s0* nocapture %a, %struct.s0* nocapture %b) nounwind ssp { +entry: +; SSE2: t3: +; SSE2: movsd (%eax), %xmm0 +; SSE2: movsd 8(%eax), %xmm1 +; SSE2: movsd %xmm1, 8(%eax) +; SSE2: movsd %xmm0, (%eax) + +; SSE1: t3: +; SSE1: movl +; SSE1: movl +; SSE1: movl +; SSE1: movl +; SSE1: movl +; SSE1: movl +; SSE1: movl +; SSE1: movl +; SSE1: movl +; SSE1: movl + +; NOSSE: t3: +; NOSSE: movl +; NOSSE: movl +; NOSSE: movl +; NOSSE: movl +; NOSSE: movl +; NOSSE: movl +; NOSSE: movl +; NOSSE: movl +; NOSSE: movl +; NOSSE: movl + +; X86-64: t3: +; X86-64: movq (%rsi), %rax +; X86-64: movq 8(%rsi), %rcx +; X86-64: movq %rcx, 8(%rdi) +; X86-64: movq %rax, (%rdi) + %tmp2 = bitcast %struct.s0* %a to i8* ; [#uses=1] + %tmp3 = bitcast %struct.s0* %b to i8* ; [#uses=1] + tail call void @llvm.memcpy.i32(i8* %tmp2, i8* %tmp3, i32 16, i32 8) + ret void +} + +declare void @llvm.memcpy.i32(i8* nocapture, i8* nocapture, i32, i32) nounwind diff --git a/test/CodeGen/X86/memset-2.ll b/test/CodeGen/X86/memset-2.ll index 7deb52f8078e..702632cde965 100644 --- a/test/CodeGen/X86/memset-2.ll +++ b/test/CodeGen/X86/memset-2.ll @@ 
-1,47 +1,21 @@ -; RUN: llc < %s | not grep rep -; RUN: llc < %s | grep memset +; RUN: llc < %s | FileCheck %s target triple = "i386" declare void @llvm.memset.i32(i8*, i8, i32, i32) nounwind -define fastcc i32 @cli_scanzip(i32 %desc) nounwind { +define fastcc void @t1() nounwind { entry: - br label %bb8.i.i.i.i - -bb8.i.i.i.i: ; preds = %bb8.i.i.i.i, %entry - icmp eq i32 0, 0 ; :0 [#uses=1] - br i1 %0, label %bb61.i.i.i, label %bb8.i.i.i.i - -bb32.i.i.i: ; preds = %bb61.i.i.i - ptrtoint i8* %tail.0.i.i.i to i32 ; :1 [#uses=1] - sub i32 0, %1 ; :2 [#uses=1] - icmp sgt i32 %2, 19 ; :3 [#uses=1] - br i1 %3, label %bb34.i.i.i, label %bb61.i.i.i - -bb34.i.i.i: ; preds = %bb32.i.i.i - load i32* null, align 4 ; :4 [#uses=1] - icmp eq i32 %4, 101010256 ; :5 [#uses=1] - br i1 %5, label %bb8.i11.i.i.i, label %bb61.i.i.i - -bb8.i11.i.i.i: ; preds = %bb8.i11.i.i.i, %bb34.i.i.i - icmp eq i32 0, 0 ; :6 [#uses=1] - br i1 %6, label %cli_dbgmsg.exit49.i, label %bb8.i11.i.i.i - -cli_dbgmsg.exit49.i: ; preds = %bb8.i11.i.i.i - icmp eq [32768 x i8]* null, null ; :7 [#uses=1] - br i1 %7, label %bb1.i28.i, label %bb8.i.i - -bb61.i.i.i: ; preds = %bb61.i.i.i, %bb34.i.i.i, %bb32.i.i.i, %bb8.i.i.i.i - %tail.0.i.i.i = getelementptr [1024 x i8]* null, i32 0, i32 0 ; [#uses=2] - load i8* %tail.0.i.i.i, align 1 ; :8 [#uses=1] - icmp eq i8 %8, 80 ; :9 [#uses=1] - br i1 %9, label %bb32.i.i.i, label %bb61.i.i.i - -bb1.i28.i: ; preds = %cli_dbgmsg.exit49.i - call void @llvm.memset.i32( i8* null, i8 0, i32 88, i32 1 ) nounwind - unreachable - -bb8.i.i: ; preds = %bb8.i.i, %cli_dbgmsg.exit49.i - br label %bb8.i.i +; CHECK: t1: +; CHECK: call memset + call void @llvm.memset.i32( i8* null, i8 0, i32 188, i32 1 ) nounwind + unreachable +} + +define fastcc void @t2(i8 signext %c) nounwind { +entry: +; CHECK: t2: +; CHECK: call memset + call void @llvm.memset.i32( i8* undef, i8 %c, i32 76, i32 1 ) nounwind + unreachable } diff --git a/test/CodeGen/X86/memset64-on-x86-32.ll 
b/test/CodeGen/X86/memset64-on-x86-32.ll index da8fc51da8e1..c0cd271d985e 100644 --- a/test/CodeGen/X86/memset64-on-x86-32.ll +++ b/test/CodeGen/X86/memset64-on-x86-32.ll @@ -1,5 +1,6 @@ -; RUN: llc < %s -mtriple=i386-apple-darwin | grep stosl -; RUN: llc < %s -mtriple=x86_64-apple-darwin | grep movq | count 10 +; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=nehalem | grep movaps | count 5 +; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=core2 | grep movl | count 20 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | grep movq | count 10 define void @bork() nounwind { entry: diff --git a/test/CodeGen/X86/pic.ll b/test/CodeGen/X86/pic.ll index e997233a5194..35b172509c95 100644 --- a/test/CodeGen/X86/pic.ll +++ b/test/CodeGen/X86/pic.ll @@ -194,10 +194,10 @@ bb12: ; LINUX: .LJTI8_0: ; LINUX: .long .LBB8_2@GOTOFF -; LINUX: .long .LBB8_2@GOTOFF -; LINUX: .long .LBB8_7@GOTOFF -; LINUX: .long .LBB8_3@GOTOFF -; LINUX: .long .LBB8_7@GOTOFF +; LINUX: .long .LBB8_8@GOTOFF +; LINUX: .long .LBB8_14@GOTOFF +; LINUX: .long .LBB8_9@GOTOFF +; LINUX: .long .LBB8_10@GOTOFF } declare void @foo1(...) 
diff --git a/test/CodeGen/X86/pmul.ll b/test/CodeGen/X86/pmul.ll index e2746a8c0638..bf5229aa1ee8 100644 --- a/test/CodeGen/X86/pmul.ll +++ b/test/CodeGen/X86/pmul.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -march=x86 -mattr=sse41 -stack-alignment=16 > %t ; RUN: grep pmul %t | count 12 -; RUN: grep mov %t | count 12 +; RUN: grep mov %t | count 11 define <4 x i32> @a(<4 x i32> %i) nounwind { %A = mul <4 x i32> %i, < i32 117, i32 117, i32 117, i32 117 > diff --git a/test/CodeGen/X86/pmulld.ll b/test/CodeGen/X86/pmulld.ll new file mode 100644 index 000000000000..3ef594112b45 --- /dev/null +++ b/test/CodeGen/X86/pmulld.ll @@ -0,0 +1,16 @@ +; RUN: llc < %s -march=x86-64 -mattr=+sse41 -asm-verbose=0 | FileCheck %s + +define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) nounwind { +; CHECK: test1: +; CHECK-NEXT: pmulld + %C = mul <4 x i32> %A, %B + ret <4 x i32> %C +} + +define <4 x i32> @test1a(<4 x i32> %A, <4 x i32> *%Bp) nounwind { +; CHECK: test1a: +; CHECK-NEXT: pmulld + %B = load <4 x i32>* %Bp + %C = mul <4 x i32> %A, %B + ret <4 x i32> %C +} diff --git a/test/CodeGen/X86/postalloc-coalescing.ll b/test/CodeGen/X86/postalloc-coalescing.ll index a171436543c6..fe6f521f4d32 100644 --- a/test/CodeGen/X86/postalloc-coalescing.ll +++ b/test/CodeGen/X86/postalloc-coalescing.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -march=x86 | grep mov | count 3 -define fastcc i32 @_Z18yy_get_next_bufferv() { +define fastcc i32 @_Z18yy_get_next_bufferv() nounwind { entry: br label %bb131 diff --git a/test/CodeGen/X86/pr2659.ll b/test/CodeGen/X86/pr2659.ll index 0760e4c7fd5b..27047dfdfd8c 100644 --- a/test/CodeGen/X86/pr2659.ll +++ b/test/CodeGen/X86/pr2659.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=x86 -mtriple=i686-apple-darwin9.4.0 | grep movl | count 5 +; RUN: llc < %s -march=x86 -mtriple=i686-apple-darwin9.4.0 | FileCheck %s ; PR2659 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" @@ -13,6 +14,11 @@ 
forcond.preheader: ; preds = %entry %cmp44 = icmp eq i32 %k, 0 ; [#uses=1] br i1 %cmp44, label %afterfor, label %forbody +; CHECK: %forcond.preheader.forbody_crit_edge +; CHECK: movl $1 +; CHECK-NOT: xorl +; CHECK-NEXT: movl $1 + ifthen: ; preds = %entry ret i32 0 diff --git a/test/CodeGen/X86/sibcall.ll b/test/CodeGen/X86/sibcall.ll index ce35b4545184..8e52a7cbfe78 100644 --- a/test/CodeGen/X86/sibcall.ll +++ b/test/CodeGen/X86/sibcall.ll @@ -271,3 +271,45 @@ entry: } declare double @bar4() + +; rdar://6283267 +define void @t17() nounwind ssp { +entry: +; 32: t17: +; 32: jmp {{_?}}bar5 + +; 64: t17: +; 64: xorb %al, %al +; 64: jmp {{_?}}bar5 + tail call void (...)* @bar5() nounwind + ret void +} + +declare void @bar5(...) + +; rdar://7774847 +define void @t18() nounwind ssp { +entry: +; 32: t18: +; 32: call {{_?}}bar6 +; 32: fstp %st(0) + +; 64: t18: +; 64: xorb %al, %al +; 64: jmp {{_?}}bar6 + %0 = tail call double (...)* @bar6() nounwind + ret void +} + +declare double @bar6(...) + +define void @t19() alignstack(32) nounwind { +entry: +; CHECK: t19: +; CHECK: andl $-32 +; CHECK: call {{_?}}foo + tail call void @foo() nounwind + ret void +} + +declare void @foo() diff --git a/test/CodeGen/X86/small-byval-memcpy.ll b/test/CodeGen/X86/small-byval-memcpy.ll index 9ec9182e5e3c..1b596b589899 100644 --- a/test/CodeGen/X86/small-byval-memcpy.ll +++ b/test/CodeGen/X86/small-byval-memcpy.ll @@ -1,7 +1,5 @@ -; RUN: llc < %s | not grep movs - -target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" -target triple = "i386-apple-darwin8" +; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=core2 | grep movsd | count 8 +; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=nehalem | grep movups | count 2 define void @ccosl({ x86_fp80, x86_fp80 }* noalias sret %agg.result, { x86_fp80, x86_fp80 }* byval align 4 %z) nounwind { entry: diff --git a/test/CodeGen/X86/sse-align-12.ll 
b/test/CodeGen/X86/sse-align-12.ll index 4f025b916fd9..118e393b7baa 100644 --- a/test/CodeGen/X86/sse-align-12.ll +++ b/test/CodeGen/X86/sse-align-12.ll @@ -1,10 +1,8 @@ -; RUN: llc < %s -march=x86-64 > %t -; RUN: grep unpck %t | count 2 -; RUN: grep shuf %t | count 2 -; RUN: grep ps %t | count 4 -; RUN: grep pd %t | count 4 -; RUN: grep movup %t | count 4 +; RUN: llc < %s -march=x86-64 | FileCheck %s +; CHECK: a: +; CHECK: movdqu +; CHECK: pshufd define <4 x float> @a(<4 x float>* %y) nounwind { %x = load <4 x float>* %y, align 4 %a = extractelement <4 x float> %x, i32 0 @@ -17,6 +15,10 @@ define <4 x float> @a(<4 x float>* %y) nounwind { %s = insertelement <4 x float> %r, float %a, i32 3 ret <4 x float> %s } + +; CHECK: b: +; CHECK: movups +; CHECK: unpckhps define <4 x float> @b(<4 x float>* %y, <4 x float> %z) nounwind { %x = load <4 x float>* %y, align 4 %a = extractelement <4 x float> %x, i32 2 @@ -29,6 +31,10 @@ define <4 x float> @b(<4 x float>* %y, <4 x float> %z) nounwind { %s = insertelement <4 x float> %r, float %b, i32 3 ret <4 x float> %s } + +; CHECK: c: +; CHECK: movupd +; CHECK: shufpd define <2 x double> @c(<2 x double>* %y) nounwind { %x = load <2 x double>* %y, align 8 %a = extractelement <2 x double> %x, i32 0 @@ -37,6 +43,10 @@ define <2 x double> @c(<2 x double>* %y) nounwind { %r = insertelement <2 x double> %p, double %a, i32 1 ret <2 x double> %r } + +; CHECK: d: +; CHECK: movupd +; CHECK: unpckhpd define <2 x double> @d(<2 x double>* %y, <2 x double> %z) nounwind { %x = load <2 x double>* %y, align 8 %a = extractelement <2 x double> %x, i32 1 diff --git a/test/CodeGen/X86/sse-align-6.ll b/test/CodeGen/X86/sse-align-6.ll index 0bbf4228a40b..fcea1b102a20 100644 --- a/test/CodeGen/X86/sse-align-6.ll +++ b/test/CodeGen/X86/sse-align-6.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=x86-64 | grep movups | count 1 +; RUN: llc < %s -march=x86-64 | grep movdqu | count 1 define <2 x i64> @bar(<2 x i64>* %p, <2 x i64> %x) nounwind { %t = load <2 x i64>* 
%p, align 8 diff --git a/test/CodeGen/X86/sse3.ll b/test/CodeGen/X86/sse3.ll index e9c2c01a9e44..b969ecb41420 100644 --- a/test/CodeGen/X86/sse3.ll +++ b/test/CodeGen/X86/sse3.ll @@ -20,7 +20,7 @@ entry: ; X64: pshuflw $0, %xmm0, %xmm0 ; X64: xorl %eax, %eax ; X64: pinsrw $0, %eax, %xmm0 -; X64: movaps %xmm0, (%rdi) +; X64: movdqa %xmm0, (%rdi) ; X64: ret } @@ -32,7 +32,7 @@ define <8 x i16> @t1(<8 x i16>* %A, <8 x i16>* %B) nounwind { ; X64: t1: ; X64: movl (%rsi), %eax -; X64: movaps (%rdi), %xmm0 +; X64: movdqa (%rdi), %xmm0 ; X64: pinsrw $0, %eax, %xmm0 ; X64: ret } @@ -66,7 +66,7 @@ define <8 x i16> @t4(<8 x i16> %A, <8 x i16> %B) nounwind { ; X64: pshufhw $100, %xmm0, %xmm2 ; X64: pinsrw $1, %eax, %xmm2 ; X64: pextrw $1, %xmm0, %eax -; X64: movaps %xmm2, %xmm0 +; X64: movdqa %xmm2, %xmm0 ; X64: pinsrw $4, %eax, %xmm0 ; X64: ret } @@ -122,7 +122,7 @@ define void @t8(<2 x i64>* %res, <2 x i64>* %A) nounwind { ; X64: t8: ; X64: pshuflw $-58, (%rsi), %xmm0 ; X64: pshufhw $-58, %xmm0, %xmm0 -; X64: movaps %xmm0, (%rdi) +; X64: movdqa %xmm0, (%rdi) ; X64: ret } diff --git a/test/CodeGen/X86/unaligned-load.ll b/test/CodeGen/X86/unaligned-load.ll index b61803d10ce4..47b7896bb8f1 100644 --- a/test/CodeGen/X86/unaligned-load.ll +++ b/test/CodeGen/X86/unaligned-load.ll @@ -1,4 +1,6 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin10.0 -relocation-model=dynamic-no-pic --asm-verbose=0 | FileCheck %s +; RUN: llc < %s -mtriple=i386-apple-darwin10.0 -mcpu=core2 -relocation-model=dynamic-no-pic --asm-verbose=0 | FileCheck -check-prefix=I386 %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin10.0 -mcpu=core2 -relocation-model=dynamic-no-pic --asm-verbose=0 | FileCheck -check-prefix=CORE2 %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin10.0 -mcpu=corei7 -relocation-model=dynamic-no-pic --asm-verbose=0 | FileCheck -check-prefix=COREI7 %s @.str1 = internal constant [31 x i8] c"DHRYSTONE PROGRAM, SOME STRING\00", align 8 @.str3 = internal constant [31 x i8] c"DHRYSTONE PROGRAM, 2'ND 
STRING\00", align 8 @@ -11,7 +13,15 @@ entry: bb: %String2Loc9 = getelementptr inbounds [31 x i8]* %String2Loc, i64 0, i64 0 call void @llvm.memcpy.i64(i8* %String2Loc9, i8* getelementptr inbounds ([31 x i8]* @.str3, i64 0, i64 0), i64 31, i32 1) -; CHECK: movups _.str3 +; I386: movsd _.str3+16 +; I386: movsd _.str3+8 +; I386: movsd _.str3 + +; CORE2: movabsq +; CORE2: movabsq +; CORE2: movabsq + +; COREI7: movups _.str3 br label %bb return: @@ -20,8 +30,8 @@ return: declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind -; CHECK: .align 3 -; CHECK-NEXT: _.str1: -; CHECK-NEXT: .asciz "DHRYSTONE PROGRAM, SOME STRING" -; CHECK: .align 3 -; CHECK-NEXT: _.str3: +; CORE2: .align 3 +; CORE2-NEXT: _.str1: +; CORE2-NEXT: .asciz "DHRYSTONE PROGRAM, SOME STRING" +; CORE2: .align 3 +; CORE2-NEXT: _.str3: diff --git a/test/CodeGen/X86/vec_compare.ll b/test/CodeGen/X86/vec_compare.ll index c8c7257cbb9c..39c9b770d5f4 100644 --- a/test/CodeGen/X86/vec_compare.ll +++ b/test/CodeGen/X86/vec_compare.ll @@ -15,7 +15,7 @@ define <4 x i32> @test2(<4 x i32> %A, <4 x i32> %B) nounwind { ; CHECK: test2: ; CHECK: pcmp ; CHECK: pcmp -; CHECK: xorps +; CHECK: pxor ; CHECK: ret %C = icmp sge <4 x i32> %A, %B %D = sext <4 x i1> %C to <4 x i32> @@ -25,7 +25,7 @@ define <4 x i32> @test2(<4 x i32> %A, <4 x i32> %B) nounwind { define <4 x i32> @test3(<4 x i32> %A, <4 x i32> %B) nounwind { ; CHECK: test3: ; CHECK: pcmpgtd -; CHECK: movaps +; CHECK: movdqa ; CHECK: ret %C = icmp slt <4 x i32> %A, %B %D = sext <4 x i1> %C to <4 x i32> @@ -34,7 +34,7 @@ define <4 x i32> @test3(<4 x i32> %A, <4 x i32> %B) nounwind { define <4 x i32> @test4(<4 x i32> %A, <4 x i32> %B) nounwind { ; CHECK: test4: -; CHECK: movaps +; CHECK: movdqa ; CHECK: pcmpgtd ; CHECK: ret %C = icmp ugt <4 x i32> %A, %B diff --git a/test/CodeGen/X86/vec_insert_4.ll b/test/CodeGen/X86/vec_insert-4.ll similarity index 100% rename from test/CodeGen/X86/vec_insert_4.ll rename to test/CodeGen/X86/vec_insert-4.ll diff 
--git a/test/CodeGen/X86/vec_insert-9.ll b/test/CodeGen/X86/vec_insert-9.ll new file mode 100644 index 000000000000..2e829df1f8df --- /dev/null +++ b/test/CodeGen/X86/vec_insert-9.ll @@ -0,0 +1,9 @@ +; RUN: llc < %s -march=x86 -mattr=+sse41 > %t +; RUN: grep pinsrd %t | count 2 + +define <4 x i32> @var_insert2(<4 x i32> %x, i32 %val, i32 %idx) nounwind { +entry: + %tmp3 = insertelement <4 x i32> undef, i32 %val, i32 0 ; <<4 x i32>> [#uses=1] + %tmp4 = insertelement <4 x i32> %tmp3, i32 %idx, i32 3 ; <<4 x i32>> [#uses=1] + ret <4 x i32> %tmp4 +} diff --git a/test/CodeGen/X86/vec_return.ll b/test/CodeGen/X86/vec_return.ll index 66762b4a0604..676be9b7179c 100644 --- a/test/CodeGen/X86/vec_return.ll +++ b/test/CodeGen/X86/vec_return.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -march=x86 -mattr=+sse2 > %t -; RUN: grep xorps %t | count 1 +; RUN: grep pxor %t | count 1 ; RUN: grep movaps %t | count 1 ; RUN: not grep shuf %t diff --git a/test/CodeGen/X86/vec_set.ll b/test/CodeGen/X86/vec_set.ll index c316df887c16..7f5f8dd213a5 100644 --- a/test/CodeGen/X86/vec_set.ll +++ b/test/CodeGen/X86/vec_set.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 | grep punpckl | count 7 +; RUN: llc < %s -march=x86 -mattr=+sse2,-sse41 | grep punpckl | count 7 define void @test(<8 x i16>* %b, i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) nounwind { %tmp = insertelement <8 x i16> zeroinitializer, i16 %a0, i32 0 ; <<8 x i16>> [#uses=1] diff --git a/test/CodeGen/X86/vec_shuffle-7.ll b/test/CodeGen/X86/vec_shuffle-7.ll index 4cdca09c72f5..64bd6a3c83b8 100644 --- a/test/CodeGen/X86/vec_shuffle-7.ll +++ b/test/CodeGen/X86/vec_shuffle-7.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -march=x86 -mattr=+sse2 -o %t -; RUN: grep xorps %t | count 1 +; RUN: grep pxor %t | count 1 ; RUN: not grep shufps %t define void @test() { diff --git a/test/CodeGen/X86/vec_shuffle-9.ll b/test/CodeGen/X86/vec_shuffle-9.ll index fc16a26b6154..07195869b8cf 100644 --- a/test/CodeGen/X86/vec_shuffle-9.ll 
+++ b/test/CodeGen/X86/vec_shuffle-9.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s define <4 x i32> @test(i8** %ptr) { -; CHECK: xorps +; CHECK: pxor ; CHECK: punpcklbw ; CHECK: punpcklwd diff --git a/test/CodeGen/X86/vec_shuffle.ll b/test/CodeGen/X86/vec_shuffle.ll index c05b79a54a15..2a48de22098f 100644 --- a/test/CodeGen/X86/vec_shuffle.ll +++ b/test/CodeGen/X86/vec_shuffle.ll @@ -1,5 +1,6 @@ ; RUN: llc < %s -march=x86 -mcpu=core2 -o %t -; RUN: grep shufp %t | count 1 +; RUN: grep movq %t | count 1 +; RUN: grep pshufd %t | count 1 ; RUN: grep movupd %t | count 1 ; RUN: grep pshufhw %t | count 1 diff --git a/test/CodeGen/X86/vec_zero.ll b/test/CodeGen/X86/vec_zero.ll index ae5af586cdc3..4d1f05629b41 100644 --- a/test/CodeGen/X86/vec_zero.ll +++ b/test/CodeGen/X86/vec_zero.ll @@ -1,5 +1,6 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 | grep xorps | count 2 +; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s +; CHECK: xorps define void @foo(<4 x float>* %P) { %T = load <4 x float>* %P ; <<4 x float>> [#uses=1] %S = fadd <4 x float> zeroinitializer, %T ; <<4 x float>> [#uses=1] @@ -7,6 +8,7 @@ define void @foo(<4 x float>* %P) { ret void } +; CHECK: pxor define void @bar(<4 x i32>* %P) { %T = load <4 x i32>* %P ; <<4 x i32>> [#uses=1] %S = add <4 x i32> zeroinitializer, %T ; <<4 x i32>> [#uses=1] diff --git a/test/CodeGen/X86/vec_zero_cse.ll b/test/CodeGen/X86/vec_zero_cse.ll index 296378c6e9f5..3b15d4cc407b 100644 --- a/test/CodeGen/X86/vec_zero_cse.ll +++ b/test/CodeGen/X86/vec_zero_cse.ll @@ -1,5 +1,4 @@ -; RUN: llc < %s -relocation-model=static -march=x86 -mcpu=yonah | grep pxor | count 1 -; RUN: llc < %s -relocation-model=static -march=x86 -mcpu=yonah | grep xorps | count 1 +; RUN: llc < %s -relocation-model=static -march=x86 -mcpu=yonah | grep pxor | count 2 ; RUN: llc < %s -relocation-model=static -march=x86 -mcpu=yonah | grep pcmpeqd | count 2 @M1 = external global <1 x i64> diff --git a/test/CodeGen/X86/widen_arith-5.ll 
b/test/CodeGen/X86/widen_arith-5.ll index f7f340873624..bae5c54eea64 100644 --- a/test/CodeGen/X86/widen_arith-5.ll +++ b/test/CodeGen/X86/widen_arith-5.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -march=x86-64 -mattr=+sse42 -disable-mmx | FileCheck %s -; CHECK: movaps +; CHECK: movdqa ; CHECK: pmulld ; CHECK: psubd diff --git a/test/CodeGen/X86/widen_cast-2.ll b/test/CodeGen/X86/widen_cast-2.ll index 1e626a2f8822..14e8f7562482 100644 --- a/test/CodeGen/X86/widen_cast-2.ll +++ b/test/CodeGen/X86/widen_cast-2.ll @@ -2,7 +2,7 @@ ; CHECK: pextrd ; CHECK: pextrd ; CHECK: movd -; CHECK: movaps +; CHECK: movdqa ; bitcast v14i16 to v7i32 diff --git a/test/CodeGen/X86/widen_load-2.ll b/test/CodeGen/X86/widen_load-2.ll index 58b557a7915c..658c05143e84 100644 --- a/test/CodeGen/X86/widen_load-2.ll +++ b/test/CodeGen/X86/widen_load-2.ll @@ -5,7 +5,7 @@ %i32vec3 = type <3 x i32> define void @add3i32(%i32vec3* sret %ret, %i32vec3* %ap, %i32vec3* %bp) { -; CHECK: movaps +; CHECK: movdqa ; CHECK: paddd ; CHECK: pextrd ; CHECK: movq @@ -33,13 +33,13 @@ define void @add3i32_2(%i32vec3* sret %ret, %i32vec3* %ap, %i32vec3* %bp) { %i32vec7 = type <7 x i32> define void @add7i32(%i32vec7* sret %ret, %i32vec7* %ap, %i32vec7* %bp) { -; CHECK: movaps -; CHECK: movaps +; CHECK: movdqa +; CHECK: movdqa ; CHECK: paddd ; CHECK: paddd ; CHECK: pextrd ; CHECK: movq -; CHECK: movaps +; CHECK: movdqa %a = load %i32vec7* %ap, align 16 %b = load %i32vec7* %bp, align 16 %x = add %i32vec7 %a, %b @@ -49,15 +49,15 @@ define void @add7i32(%i32vec7* sret %ret, %i32vec7* %ap, %i32vec7* %bp) { %i32vec12 = type <12 x i32> define void @add12i32(%i32vec12* sret %ret, %i32vec12* %ap, %i32vec12* %bp) { -; CHECK: movaps -; CHECK: movaps -; CHECK: movaps +; CHECK: movdqa +; CHECK: movdqa +; CHECK: movdqa ; CHECK: paddd ; CHECK: paddd ; CHECK: paddd -; CHECK: movaps -; CHECK: movaps -; CHECK: movaps +; CHECK: movdqa +; CHECK: movdqa +; CHECK: movdqa %a = load %i32vec12* %ap, align 16 %b = load %i32vec12* %bp, align 16 %x = 
add %i32vec12 %a, %b @@ -68,7 +68,7 @@ define void @add12i32(%i32vec12* sret %ret, %i32vec12* %ap, %i32vec12* %bp) { %i16vec3 = type <3 x i16> define void @add3i16(%i16vec3* nocapture sret %ret, %i16vec3* %ap, %i16vec3* %bp) nounwind { -; CHECK: movaps +; CHECK: movdqa ; CHECK: paddw ; CHECK: movd ; CHECK: pextrw @@ -81,7 +81,7 @@ define void @add3i16(%i16vec3* nocapture sret %ret, %i16vec3* %ap, %i16vec3* %bp %i16vec4 = type <4 x i16> define void @add4i16(%i16vec4* nocapture sret %ret, %i16vec4* %ap, %i16vec4* %bp) nounwind { -; CHECK: movaps +; CHECK: movdqa ; CHECK: paddw ; CHECK: movq %a = load %i16vec4* %ap, align 16 @@ -93,12 +93,12 @@ define void @add4i16(%i16vec4* nocapture sret %ret, %i16vec4* %ap, %i16vec4* %bp %i16vec12 = type <12 x i16> define void @add12i16(%i16vec12* nocapture sret %ret, %i16vec12* %ap, %i16vec12* %bp) nounwind { -; CHECK: movaps -; CHECK: movaps +; CHECK: movdqa +; CHECK: movdqa ; CHECK: paddw ; CHECK: paddw ; CHECK: movq -; CHECK: movaps +; CHECK: movdqa %a = load %i16vec12* %ap, align 16 %b = load %i16vec12* %bp, align 16 %x = add %i16vec12 %a, %b @@ -108,15 +108,15 @@ define void @add12i16(%i16vec12* nocapture sret %ret, %i16vec12* %ap, %i16vec12* %i16vec18 = type <18 x i16> define void @add18i16(%i16vec18* nocapture sret %ret, %i16vec18* %ap, %i16vec18* %bp) nounwind { -; CHECK: movaps -; CHECK: movaps -; CHECK: movaps +; CHECK: movdqa +; CHECK: movdqa +; CHECK: movdqa ; CHECK: paddw ; CHECK: paddw ; CHECK: paddw ; CHECK: movd -; CHECK: movaps -; CHECK: movaps +; CHECK: movdqa +; CHECK: movdqa %a = load %i16vec18* %ap, align 16 %b = load %i16vec18* %bp, align 16 %x = add %i16vec18 %a, %b @@ -127,7 +127,7 @@ define void @add18i16(%i16vec18* nocapture sret %ret, %i16vec18* %ap, %i16vec18* %i8vec3 = type <3 x i8> define void @add3i8(%i8vec3* nocapture sret %ret, %i8vec3* %ap, %i8vec3* %bp) nounwind { -; CHECK: movaps +; CHECK: movdqa ; CHECK: paddb ; CHECK: pextrb ; CHECK: movb @@ -140,8 +140,8 @@ define void @add3i8(%i8vec3* 
nocapture sret %ret, %i8vec3* %ap, %i8vec3* %bp) no %i8vec31 = type <31 x i8> define void @add31i8(%i8vec31* nocapture sret %ret, %i8vec31* %ap, %i8vec31* %bp) nounwind { -; CHECK: movaps -; CHECK: movaps +; CHECK: movdqa +; CHECK: movdqa ; CHECK: paddb ; CHECK: paddb ; CHECK: movq diff --git a/test/CodeGen/X86/xor-icmp.ll b/test/CodeGen/X86/xor-icmp.ll index 2d75c5d76201..34875ed89957 100644 --- a/test/CodeGen/X86/xor-icmp.ll +++ b/test/CodeGen/X86/xor-icmp.ll @@ -43,7 +43,7 @@ define i32 @t2(i32 %x, i32 %y) nounwind ssp { ; X32: cmpl ; X32: sete ; X32-NOT: xor -; X32: je +; X32: jne ; X64: t2: ; X64: testl @@ -51,7 +51,7 @@ define i32 @t2(i32 %x, i32 %y) nounwind ssp { ; X64: testl ; X64: sete ; X64-NOT: xor -; X64: je +; X64: jne entry: %0 = icmp eq i32 %x, 0 ; [#uses=1] %1 = icmp eq i32 %y, 0 ; [#uses=1] diff --git a/test/CodeGen/X86/xor.ll b/test/CodeGen/X86/xor.ll index 9bfff8a06a86..f270d9d56e21 100644 --- a/test/CodeGen/X86/xor.ll +++ b/test/CodeGen/X86/xor.ll @@ -7,7 +7,7 @@ define <4 x i32> @test1() nounwind { ret <4 x i32> %tmp ; X32: test1: -; X32: xorps %xmm0, %xmm0 +; X32: pxor %xmm0, %xmm0 ; X32: ret } diff --git a/test/DebugInfo/2009-11-03-InsertExtractValue.ll b/test/DebugInfo/2009-11-03-InsertExtractValue.ll index d9a67d64b62e..8782e4446f4b 100644 --- a/test/DebugInfo/2009-11-03-InsertExtractValue.ll +++ b/test/DebugInfo/2009-11-03-InsertExtractValue.ll @@ -3,9 +3,9 @@ !0 = metadata !{i32 42} define <{i32, i32}> @f1() { -; CHECK: !dbg !0 - %r = insertvalue <{ i32, i32 }> zeroinitializer, i32 4, 1, !dbg !0 -; CHECK: !dbg !0 - %e = extractvalue <{ i32, i32 }> %r, 0, !dbg !0 +; CHECK: !dbgx !0 + %r = insertvalue <{ i32, i32 }> zeroinitializer, i32 4, 1, !dbgx !0 +; CHECK: !dbgx !0 + %e = extractvalue <{ i32, i32 }> %r, 0, !dbgx !0 ret <{ i32, i32 }> %r } diff --git a/test/DebugInfo/2010-03-22-CU-HighLow.ll b/test/DebugInfo/2010-03-22-CU-HighLow.ll new file mode 100644 index 000000000000..42c25e4711cc --- /dev/null +++ 
b/test/DebugInfo/2010-03-22-CU-HighLow.ll @@ -0,0 +1,9 @@ +; RUN: llc < %s | grep low_pc | count 2 +@i = global i32 1 ; [#uses=0] + +!llvm.dbg.gv = !{!0} + +!0 = metadata !{i32 524340, i32 0, metadata !1, metadata !"i", metadata !"i", metadata !"", metadata !1, i32 1, metadata !3, i1 false, i1 true, i32* @i} ; [ DW_TAG_variable ] +!1 = metadata !{i32 524329, metadata !"b.c", metadata !"/tmp", metadata !2} ; [ DW_TAG_file_type ] +!2 = metadata !{i32 524305, i32 0, i32 1, metadata !"b.c", metadata !"/tmp", metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, i1 false, metadata !"", i32 0} ; [ DW_TAG_compile_unit ] +!3 = metadata !{i32 524324, metadata !1, metadata !"int", metadata !1, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] diff --git a/test/DebugInfo/2010-03-24-MemberFn.ll b/test/DebugInfo/2010-03-24-MemberFn.ll new file mode 100644 index 000000000000..20c0b8ee009f --- /dev/null +++ b/test/DebugInfo/2010-03-24-MemberFn.ll @@ -0,0 +1,62 @@ +; RUN: llc -O0 < %s | grep AT_decl_file | grep 2 +; Here _ZN1S3fooEv is defined in header file identified as AT_decl_file no. 2 in debug info. 
+%struct.S = type <{ i8 }> + +define i32 @_Z3barv() nounwind ssp { +entry: + %retval = alloca i32 ; [#uses=2] + %0 = alloca i32 ; [#uses=2] + %s1 = alloca %struct.S ; <%struct.S*> [#uses=1] + %"alloca point" = bitcast i32 0 to i32 ; [#uses=0] + call void @llvm.dbg.declare(metadata !{%struct.S* %s1}, metadata !0), !dbg !16 + %1 = call i32 @_ZN1S3fooEv(%struct.S* %s1) nounwind, !dbg !17 ; [#uses=1] + store i32 %1, i32* %0, align 4, !dbg !17 + %2 = load i32* %0, align 4, !dbg !17 ; [#uses=1] + store i32 %2, i32* %retval, align 4, !dbg !17 + br label %return, !dbg !17 + +return: ; preds = %entry + %retval1 = load i32* %retval, !dbg !17 ; [#uses=1] + ret i32 %retval1, !dbg !16 +} + +define linkonce_odr i32 @_ZN1S3fooEv(%struct.S* %this) nounwind ssp align 2 { +entry: + %this_addr = alloca %struct.S* ; <%struct.S**> [#uses=1] + %retval = alloca i32 ; [#uses=1] + %"alloca point" = bitcast i32 0 to i32 ; [#uses=0] + call void @llvm.dbg.declare(metadata !{%struct.S** %this_addr}, metadata !18), !dbg !21 + store %struct.S* %this, %struct.S** %this_addr + br label %return, !dbg !21 + +return: ; preds = %entry + %retval1 = load i32* %retval, !dbg !21 ; [#uses=1] + ret i32 %retval1, !dbg !22 +} + +declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone + +!0 = metadata !{i32 524544, metadata !1, metadata !"s1", metadata !4, i32 3, metadata !9} ; [ DW_TAG_auto_variable ] +!1 = metadata !{i32 524299, metadata !2, i32 3, i32 0} ; [ DW_TAG_lexical_block ] +!2 = metadata !{i32 524299, metadata !3, i32 3, i32 0} ; [ DW_TAG_lexical_block ] +!3 = metadata !{i32 524334, i32 0, metadata !4, metadata !"bar", metadata !"bar", metadata !"_Z3barv", metadata !4, i32 3, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i1 false} ; [ DW_TAG_subprogram ] +!4 = metadata !{i32 524329, metadata !"one.cc", metadata !"/tmp/", metadata !5} ; [ DW_TAG_file_type ] +!5 = metadata !{i32 524305, i32 0, i32 4, metadata !"one.cc", metadata !"/tmp/", metadata !"4.2.1 (Based on Apple Inc. 
build 5658) (LLVM build)", i1 true, i1 false, metadata !"", i32 0} ; [ DW_TAG_compile_unit ] +!6 = metadata !{i32 524309, metadata !4, metadata !"", metadata !4, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null} ; [ DW_TAG_subroutine_type ] +!7 = metadata !{metadata !8} +!8 = metadata !{i32 524324, metadata !4, metadata !"int", metadata !4, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] +!9 = metadata !{i32 524307, metadata !4, metadata !"S", metadata !10, i32 2, i64 8, i64 8, i64 0, i32 0, null, metadata !11, i32 0, null} ; [ DW_TAG_structure_type ] +!10 = metadata !{i32 524329, metadata !"one.h", metadata !"/tmp/", metadata !5} ; [ DW_TAG_file_type ] +!11 = metadata !{metadata !12} +!12 = metadata !{i32 524334, i32 0, metadata !9, metadata !"foo", metadata !"foo", metadata !"_ZN1S3fooEv", metadata !10, i32 3, metadata !13, i1 false, i1 true, i32 0, i32 0, null, i1 false} ; [ DW_TAG_subprogram ] +!13 = metadata !{i32 524309, metadata !4, metadata !"", metadata !4, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !14, i32 0, null} ; [ DW_TAG_subroutine_type ] +!14 = metadata !{metadata !8, metadata !15} +!15 = metadata !{i32 524303, metadata !4, metadata !"", metadata !4, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !9} ; [ DW_TAG_pointer_type ] +!16 = metadata !{i32 3, i32 0, metadata !1, null} +!17 = metadata !{i32 3, i32 0, metadata !3, null} +!18 = metadata !{i32 524545, metadata !12, metadata !"this", metadata !10, i32 3, metadata !19} ; [ DW_TAG_arg_variable ] +!19 = metadata !{i32 524326, metadata !4, metadata !"", metadata !4, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !20} ; [ DW_TAG_const_type ] +!20 = metadata !{i32 524303, metadata !4, metadata !"", metadata !4, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !9} ; [ DW_TAG_pointer_type ] +!21 = metadata !{i32 3, i32 0, metadata !12, null} +!22 = metadata !{i32 3, i32 0, metadata !23, null} +!23 = metadata !{i32 524299, metadata !12, i32 3, i32 0} ; [ 
DW_TAG_lexical_block ] diff --git a/test/DebugInfo/2010-03-30-InvalidDbgInfoCrash.ll b/test/DebugInfo/2010-03-30-InvalidDbgInfoCrash.ll new file mode 100644 index 000000000000..9bb35fab4fee --- /dev/null +++ b/test/DebugInfo/2010-03-30-InvalidDbgInfoCrash.ll @@ -0,0 +1,30 @@ +; RUN: llc < %s -o /dev/null + +define void @baz(i32 %i) nounwind ssp { +entry: + call void @llvm.dbg.declare(metadata !0, metadata !1), !dbg !0 + ret void, !dbg !0 +} + +declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone + +!0 = metadata !{{ [0 x i8] }** undef} +!1 = metadata !{i32 524544, metadata !2, metadata !"x", metadata !4, i32 11, metadata !9} ; [ DW_TAG_auto_variable ] +!2 = metadata !{i32 524299, metadata !3, i32 8, i32 0} ; [ DW_TAG_lexical_block ] +!3 = metadata !{i32 524334, i32 0, metadata !4, metadata !"baz", metadata !"baz", metadata !"baz", metadata !4, i32 8, metadata !6, i1 true, i1 true, i32 0, i32 0, null, i1 false} ; [ DW_TAG_subprogram ] +!4 = metadata !{i32 524329, metadata !"2007-12-VarArrayDebug.c", metadata !"/Users/sabre/llvm/test/FrontendC/", metadata !5} ; [ DW_TAG_file_type ] +!5 = metadata !{i32 524305, i32 0, i32 1, metadata !"2007-12-VarArrayDebug.c", metadata !"/Users/sabre/llvm/test/FrontendC/", metadata !"4.2.1 (Based on Apple Inc. 
build 5658) (LLVM build)", i1 true, i1 false, metadata !"", i32 0} ; [ DW_TAG_compile_unit ] +!6 = metadata !{i32 524309, metadata !4, metadata !"", metadata !4, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null} ; [ DW_TAG_subroutine_type ] +!7 = metadata !{null, metadata !8} +!8 = metadata !{i32 524324, metadata !4, metadata !"int", metadata !4, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] +!9 = metadata !{i32 524303, metadata !4, metadata !"", metadata !4, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !10} ; [ DW_TAG_pointer_type ] +!10 = metadata !{i32 524307, metadata !3, metadata !"", metadata !4, i32 11, i64 8, i64 8, i64 0, i32 0, null, metadata !11, i32 0, null} ; [ DW_TAG_structure_type ] +!11 = metadata !{metadata !12} +!12 = metadata !{i32 524301, metadata !10, metadata !"b", metadata !4, i32 11, i64 8, i64 8, i64 0, i32 0, metadata !13} ; [ DW_TAG_member ] +!13 = metadata !{i32 524310, metadata !3, metadata !"A", metadata !4, i32 11, i64 0, i64 0, i64 0, i32 0, metadata !14} ; [ DW_TAG_typedef ] +!14 = metadata !{i32 524289, metadata !4, metadata !"", metadata !4, i32 0, i64 8, i64 8, i64 0, i32 0, metadata !15, metadata !16, i32 0, null} ; [ DW_TAG_array_type ] +!15 = metadata !{i32 524324, metadata !4, metadata !"char", metadata !4, i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] +!16 = metadata !{metadata !17} +!17 = metadata !{i32 524321, i64 0, i64 0} ; [ DW_TAG_subrange_type ] +!18 = metadata !{metadata !"llvm.mdnode.fwdref.19"} +!19 = metadata !{metadata !"llvm.mdnode.fwdref.23"} diff --git a/test/Feature/unions.ll b/test/Feature/unions.ll index 9d6c36bb3c53..3cf8c3ce0e97 100644 --- a/test/Feature/unions.ll +++ b/test/Feature/unions.ll @@ -6,7 +6,9 @@ @union1 = constant union { i32, i8 } { i32 4 } @union2 = constant union { i32, i8 } insertvalue(union { i32, i8 } undef, i32 4, 0) +@union3 = common global %union.anon zeroinitializer, align 8 define void @"Unions" () { ret void } + diff --git 
a/test/FrontendC++/2010-03-22-empty-baseclass.cpp b/test/FrontendC++/2010-03-22-empty-baseclass.cpp new file mode 100644 index 000000000000..b6bdea40c3f2 --- /dev/null +++ b/test/FrontendC++/2010-03-22-empty-baseclass.cpp @@ -0,0 +1,134 @@ +// RUN: %llvmgxx -S -emit-llvm %s -o - -O2 | FileCheck %s +namespace boost { + namespace detail { + template struct cv_traits_imp {}; + template struct cv_traits_imp {typedef T unqualified_type;}; + } +} +namespace mpl_ {} +namespace boost { + namespace mpl {using namespace mpl_;} + template< typename T > struct remove_cv {typedef typename boost::detail::cv_traits_imp::unqualified_type type;}; + namespace type_traits { + typedef char yes_type; + struct no_type {char padding[8];}; + } +} +namespace mpl_ { + template< bool C_ > struct bool_; + typedef bool_ true_; + typedef bool_ false_; + template< bool C_ > struct bool_ {static const bool value = C_;}; + template< typename T, T N > struct integral_c; +} +namespace boost{ + template struct integral_constant : + public mpl::integral_c {}; + template<> struct integral_constant : public mpl::true_ {}; + template<> struct integral_constant : public mpl::false_ {}; + namespace type_traits { + template struct ice_or; + template + struct ice_or {static const bool value = true; }; + template <> struct ice_or + {static const bool value = false;}; + template struct ice_and; + template + struct ice_and {static const bool value = false;}; + template <> struct ice_and + {static const bool value = true;}; + template struct ice_not {static const bool value = true;}; + }; + namespace detail { + template struct is_union_impl {static const bool value = false;}; + } + template< typename T > struct is_union : + ::boost::integral_constant::value> {}; + namespace detail { + template ::boost::type_traits::yes_type is_class_tester(void(U::*)(void)); + template ::boost::type_traits::no_type is_class_tester(...); + template struct is_class_impl { + static const bool value = (::boost::type_traits::ice_and< 
sizeof(is_class_tester(0)) + == sizeof(::boost::type_traits::yes_type), + ::boost::type_traits::ice_not< ::boost::is_union::value >::value >::value);}; +} + template struct is_class: + ::boost::integral_constant::value> { }; +namespace detail { + template struct empty_helper_t1: public T {int i[256];}; + struct empty_helper_t2 {int i[256];}; + template struct empty_helper + {static const bool value = false;}; + template struct empty_helper + {static const bool value = (sizeof(empty_helper_t1) == sizeof(empty_helper_t2));}; + template struct is_empty_impl { + typedef typename remove_cv::type cvt; + static const bool value = (::boost::type_traits::ice_or< ::boost::detail::empty_helper + ::value>::value, false>::value); + }; +} +template struct is_empty: +::boost::integral_constant::value> {}; +template struct is_same: +::boost::integral_constant {}; +template struct call_traits {typedef T& reference;}; +namespace details { + template + struct compressed_pair_switch; + template + struct compressed_pair_switch + {static const int value = 1;}; + template class compressed_pair_imp; + template class compressed_pair_imp: + protected ::boost::remove_cv::type { + public: + typedef T1 first_type; + typedef T2 second_type; + typedef typename call_traits::reference first_reference; + typedef typename call_traits::reference second_reference; + first_reference first() {return *this;} + second_reference second() {return second_;} + second_type second_; + }; +} +template class compressed_pair: + private ::boost::details::compressed_pair_imp::type, + typename remove_cv::type>::value, + ::boost::is_empty::value, ::boost::is_empty::value>::value> + { + private: + typedef details::compressed_pair_imp::type, + typename remove_cv::type>::value, + ::boost::is_empty::value, ::boost::is_empty::value>::value> base; + public: + typedef T1 first_type; + typedef T2 second_type; + typedef typename call_traits::reference first_reference; + typedef typename call_traits::reference second_reference; 
+ first_reference first() {return base::first();} + second_reference second() {return base::second();} + }; +} +struct empty_base_t {}; +struct empty_t : empty_base_t {}; +typedef boost::compressed_pair data_t; +extern "C" {int printf(const char * , ...);} +extern "C" {void abort(void);} +int main (int argc, char * const argv[]) { + data_t x; + x.second() = -3; + // This store should be elided: + x.first() = empty_t(); + // If x.second() has been clobbered by the elided store, fail. + if (x.second() != -3) { + printf("x.second() was clobbered\n"); + // CHECK-NOT: x.second() was clobbered + abort(); + } + return 0; +} +// CHECK: ret i32 diff --git a/test/FrontendObjC/2010-03-17-StructRef.m b/test/FrontendObjC/2010-03-17-StructRef.m index a8a509c2e98a..3594684a194d 100644 --- a/test/FrontendObjC/2010-03-17-StructRef.m +++ b/test/FrontendObjC/2010-03-17-StructRef.m @@ -1,4 +1,4 @@ -// RUN: %llvmgcc %s -S -o - | FileCheck %s +// RUN: %llvmgcc %s -m64 -S -o - | FileCheck %s // Bitfield references must not touch memory outside of the enclosing // struct. 
Radar 7639995 typedef signed char BOOL; diff --git a/test/MC/AsmParser/X86/x86_32-bit_cat.s b/test/MC/AsmParser/X86/x86_32-bit_cat.s index e910c653e96c..ec2bfa4a913d 100644 --- a/test/MC/AsmParser/X86/x86_32-bit_cat.s +++ b/test/MC/AsmParser/X86/x86_32-bit_cat.s @@ -7806,3 +7806,39 @@ // CHECK: pcmpgtq %xmm5, %xmm5 pcmpgtq %xmm5,%xmm5 + +// CHECK: aesimc %xmm0, %xmm1 + aesimc %xmm0,%xmm1 + +// CHECK: aesimc (%eax), %xmm1 + aesimc (%eax),%xmm1 + +// CHECK: aesenc %xmm1, %xmm2 + aesenc %xmm1,%xmm2 + +// CHECK: aesenc 4(%ebx), %xmm2 + aesenc 4(%ebx),%xmm2 + +// CHECK: aesenclast %xmm3, %xmm4 + aesenclast %xmm3,%xmm4 + +// CHECK: aesenclast 4(%edx,%edi), %xmm4 + aesenclast 4(%edx,%edi),%xmm4 + +// CHECK: aesdec %xmm5, %xmm6 + aesdec %xmm5,%xmm6 + +// CHECK: aesdec 4(%ecx,%eax,8), %xmm6 + aesdec 4(%ecx,%eax,8),%xmm6 + +// CHECK: aesdeclast %xmm7, %xmm0 + aesdeclast %xmm7,%xmm0 + +// CHECK: aesdeclast 3405691582, %xmm0 + aesdeclast 0xcafebabe,%xmm0 + +// CHECK: aeskeygenassist $125, %xmm1, %xmm2 + aeskeygenassist $125, %xmm1, %xmm2 + +// CHECK: aeskeygenassist $125, (%edx,%eax,4), %xmm2 + aeskeygenassist $125, (%edx,%eax,4), %xmm2 diff --git a/test/MC/AsmParser/X86/x86_32-encoding.s b/test/MC/AsmParser/X86/x86_32-encoding.s index 2088aa7bd551..22627585220f 100644 --- a/test/MC/AsmParser/X86/x86_32-encoding.s +++ b/test/MC/AsmParser/X86/x86_32-encoding.s @@ -9905,3 +9905,59 @@ // CHECK: crc32l %ecx, %ecx // CHECK: encoding: [0xf2,0x0f,0x38,0xf1,0xc9] crc32l %ecx,%ecx + +// CHECK: pcmpistrm $125, %xmm1, %xmm2 +// CHECK: encoding: [0x66,0x0f,0x3a,0x62,0xd1,0x7d] + pcmpistrm $125, %xmm1, %xmm2 + +// CHECK: pcmpistrm $125, (%edx,%eax,4), %xmm2 +// CHECK: encoding: [0x66,0x0f,0x3a,0x62,0x14,0x82,0x7d] + pcmpistrm $125, (%edx,%eax,4), %xmm2 + +// CHECK: aesimc %xmm0, %xmm1 +// CHECK: encoding: [0x66,0x0f,0x38,0xdb,0xc8] + aesimc %xmm0,%xmm1 + +// CHECK: aesimc (%eax), %xmm1 +// CHECK: encoding: [0x66,0x0f,0x38,0xdb,0x08] + aesimc (%eax),%xmm1 + +// CHECK: aesenc %xmm1, %xmm2 
+// CHECK: encoding: [0x66,0x0f,0x38,0xdc,0xd1] + aesenc %xmm1,%xmm2 + +// CHECK: aesenc 4(%ebx), %xmm2 +// CHECK: encoding: [0x66,0x0f,0x38,0xdc,0x53,0x04] + aesenc 4(%ebx),%xmm2 + +// CHECK: aesenclast %xmm3, %xmm4 +// CHECK: encoding: [0x66,0x0f,0x38,0xdd,0xe3] + aesenclast %xmm3,%xmm4 + +// CHECK: aesenclast 4(%edx,%edi), %xmm4 +// CHECK: encoding: [0x66,0x0f,0x38,0xdd,0x64,0x3a,0x04] + aesenclast 4(%edx,%edi),%xmm4 + +// CHECK: aesdec %xmm5, %xmm6 +// CHECK: encoding: [0x66,0x0f,0x38,0xde,0xf5] + aesdec %xmm5,%xmm6 + +// CHECK: aesdec 4(%ecx,%eax,8), %xmm6 +// CHECK: encoding: [0x66,0x0f,0x38,0xde,0x74,0xc1,0x04] + aesdec 4(%ecx,%eax,8),%xmm6 + +// CHECK: aesdeclast %xmm7, %xmm0 +// CHECK: encoding: [0x66,0x0f,0x38,0xdf,0xc7] + aesdeclast %xmm7,%xmm0 + +// CHECK: aesdeclast 3405691582, %xmm0 +// CHECK: encoding: [0x66,0x0f,0x38,0xdf,0x05,0xbe,0xba,0xfe,0xca] + aesdeclast 0xcafebabe,%xmm0 + +// CHECK: aeskeygenassist $125, %xmm1, %xmm2 +// CHECK: encoding: [0x66,0x0f,0x3a,0xdf,0xd1,0x7d] + aeskeygenassist $125, %xmm1, %xmm2 + +// CHECK: aeskeygenassist $125, (%edx,%eax,4), %xmm2 +// CHECK: encoding: [0x66,0x0f,0x3a,0xdf,0x14,0x82,0x7d] + aeskeygenassist $125, (%edx,%eax,4), %xmm2 diff --git a/test/MC/MachO/absolutize.s b/test/MC/MachO/absolutize.s index ade5c195addc..76acd5bccfbe 100644 --- a/test/MC/MachO/absolutize.s +++ b/test/MC/MachO/absolutize.s @@ -1,27 +1,4 @@ -// RUN: llvm-mc -triple i386-apple-darwin9 %s -filetype=obj -o - | macho-dump | FileCheck %s - -// CHECK: # Relocation 0 -// CHECK: (('word-0', 0xa0000028), -// CHECK: ('word-1', 0x2b)), -// CHECK: # Relocation 1 -// CHECK: (('word-0', 0xa4000020), -// CHECK: ('word-1', 0x37)), -// CHECK: # Relocation 2 -// CHECK: (('word-0', 0xa1000000), -// CHECK: ('word-1', 0x33)), -// CHECK: # Relocation 3 -// CHECK: (('word-0', 0xa4000018), -// CHECK: ('word-1', 0x33)), -// CHECK: # Relocation 4 -// CHECK: (('word-0', 0xa1000000), -// CHECK: ('word-1', 0x2f)), -// CHECK: # Relocation 5 -// CHECK: (('word-0', 
0xa4000010), -// CHECK: ('word-1', 0x2b)), -// CHECK: # Relocation 6 -// CHECK: (('word-0', 0xa1000000), -// CHECK: ('word-1', 0x2f)), -// CHECK-NEXT: ]) +// RUN: llvm-mc -triple i386-apple-darwin9 %s -filetype=obj -o - | macho-dump --dump-section-data | FileCheck %s _text_a: xorl %eax,%eax @@ -69,3 +46,168 @@ Ldata_expr_2 = Ldata_d - Ldata_c .long Ldata_expr_2 .long _data_a + Ldata_expr_0 + +// CHECK: ('cputype', 7) +// CHECK: ('cpusubtype', 3) +// CHECK: ('filetype', 1) +// CHECK: ('num_load_commands', 1) +// CHECK: ('load_commands_size', 296) +// CHECK: ('flag', 0) +// CHECK: ('load_commands', [ +// CHECK: # Load Command 0 +// CHECK: (('command', 1) +// CHECK: ('size', 192) +// CHECK: ('segment_name', '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00') +// CHECK: ('vm_addr', 0) +// CHECK: ('vm_size', 87) +// CHECK: ('file_offset', 324) +// CHECK: ('file_size', 87) +// CHECK: ('maxprot', 7) +// CHECK: ('initprot', 7) +// CHECK: ('num_sections', 2) +// CHECK: ('flags', 0) +// CHECK: ('sections', [ +// CHECK: # Section 0 +// CHECK: (('section_name', '__text\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00') +// CHECK: ('segment_name', '__TEXT\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00') +// CHECK: ('address', 0) +// CHECK: ('size', 43) +// CHECK: ('offset', 324) +// CHECK: ('alignment', 0) +// CHECK: ('reloc_offset', 412) +// CHECK: ('num_reloc', 7) +// CHECK: ('flags', 0x80000400) +// CHECK: ('reserved1', 0) +// CHECK: ('reserved2', 0) +// CHECK: ), +// CHECK: ('_relocations', [ +// CHECK: # Relocation 0 +// CHECK: (('word-0', 0xa0000027), +// CHECK: ('word-1', 0x0)), +// CHECK: # Relocation 1 +// CHECK: (('word-0', 0xa400001d), +// CHECK: ('word-1', 0x6)), +// CHECK: # Relocation 2 +// CHECK: (('word-0', 0xa1000000), +// CHECK: ('word-1', 0x4)), +// CHECK: # Relocation 3 +// CHECK: (('word-0', 0xa4000013), +// CHECK: ('word-1', 0x4)), +// CHECK: # Relocation 4 +// CHECK: (('word-0', 0xa1000000), +// CHECK: ('word-1', 0x2)), +// CHECK: # Relocation 5 +// 
CHECK: (('word-0', 0xa4000009), +// CHECK: ('word-1', 0x0)), +// CHECK: # Relocation 6 +// CHECK: (('word-0', 0xa1000000), +// CHECK: ('word-1', 0x2)), +// CHECK: ]) +// CHECK: ('_section_data', '1\xc01\xc01\xc01\xc0\xb8\xfe\xff\xff\xff\xb8\xfe\xff\xff\xff\xb8\x02\x00\x00\x00\xb8\x02\x00\x00\x00\xb8\x02\x00\x00\x00\xb8\x02\x00\x00\x00\xb8\xfe\xff\xff\xff') +// CHECK: # Section 1 +// CHECK: (('section_name', '__data\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00') +// CHECK: ('segment_name', '__DATA\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00') +// CHECK: ('address', 43) +// CHECK: ('size', 44) +// CHECK: ('offset', 367) +// CHECK: ('alignment', 0) +// CHECK: ('reloc_offset', 468) +// CHECK: ('num_reloc', 7) +// CHECK: ('flags', 0x0) +// CHECK: ('reserved1', 0) +// CHECK: ('reserved2', 0) +// CHECK: ), +// CHECK: ('_relocations', [ +// CHECK: # Relocation 0 +// CHECK: (('word-0', 0xa0000028), +// CHECK: ('word-1', 0x2b)), +// CHECK: # Relocation 1 +// CHECK: (('word-0', 0xa4000020), +// CHECK: ('word-1', 0x37)), +// CHECK: # Relocation 2 +// CHECK: (('word-0', 0xa1000000), +// CHECK: ('word-1', 0x33)), +// CHECK: # Relocation 3 +// CHECK: (('word-0', 0xa4000018), +// CHECK: ('word-1', 0x33)), +// CHECK: # Relocation 4 +// CHECK: (('word-0', 0xa1000000), +// CHECK: ('word-1', 0x2f)), +// CHECK: # Relocation 5 +// CHECK: (('word-0', 0xa4000010), +// CHECK: ('word-1', 0x2b)), +// CHECK: # Relocation 6 +// CHECK: (('word-0', 0xa1000000), +// CHECK: ('word-1', 0x2f)), +// CHECK: ]) +// CHECK: ('_section_data', "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xfc\xff\xff\xff\xfc\xff\xff\xff\x04\x00\x00\x00\x04\x00\x00\x00\x04\x00\x00\x00\x04\x00\x00\x00'\x00\x00\x00") +// CHECK: ]) +// CHECK: ), +// CHECK: # Load Command 1 +// CHECK: (('command', 2) +// CHECK: ('size', 24) +// CHECK: ('symoff', 524) +// CHECK: ('nsyms', 4) +// CHECK: ('stroff', 572) +// CHECK: ('strsize', 36) +// CHECK: ('_string_data', '\x00_text_a\x00_text_b\x00_data_a\x00_data_b\x00\x00\x00\x00') 
+// CHECK: ('_symbols', [ +// CHECK: # Symbol 0 +// CHECK: (('n_strx', 1) +// CHECK: ('n_type', 0xe) +// CHECK: ('n_sect', 1) +// CHECK: ('n_desc', 0) +// CHECK: ('n_value', 0) +// CHECK: ('_string', '_text_a') +// CHECK: ), +// CHECK: # Symbol 1 +// CHECK: (('n_strx', 9) +// CHECK: ('n_type', 0xe) +// CHECK: ('n_sect', 1) +// CHECK: ('n_desc', 0) +// CHECK: ('n_value', 2) +// CHECK: ('_string', '_text_b') +// CHECK: ), +// CHECK: # Symbol 2 +// CHECK: (('n_strx', 17) +// CHECK: ('n_type', 0xe) +// CHECK: ('n_sect', 2) +// CHECK: ('n_desc', 0) +// CHECK: ('n_value', 43) +// CHECK: ('_string', '_data_a') +// CHECK: ), +// CHECK: # Symbol 3 +// CHECK: (('n_strx', 25) +// CHECK: ('n_type', 0xe) +// CHECK: ('n_sect', 2) +// CHECK: ('n_desc', 0) +// CHECK: ('n_value', 47) +// CHECK: ('_string', '_data_b') +// CHECK: ), +// CHECK: ]) +// CHECK: ), +// CHECK: # Load Command 2 +// CHECK: (('command', 11) +// CHECK: ('size', 80) +// CHECK: ('ilocalsym', 0) +// CHECK: ('nlocalsym', 4) +// CHECK: ('iextdefsym', 4) +// CHECK: ('nextdefsym', 0) +// CHECK: ('iundefsym', 4) +// CHECK: ('nundefsym', 0) +// CHECK: ('tocoff', 0) +// CHECK: ('ntoc', 0) +// CHECK: ('modtaboff', 0) +// CHECK: ('nmodtab', 0) +// CHECK: ('extrefsymoff', 0) +// CHECK: ('nextrefsyms', 0) +// CHECK: ('indirectsymoff', 0) +// CHECK: ('nindirectsyms', 0) +// CHECK: ('extreloff', 0) +// CHECK: ('nextrel', 0) +// CHECK: ('locreloff', 0) +// CHECK: ('nlocrel', 0) +// CHECK: ('_indirect_symbols', [ +// CHECK: ]) +// CHECK: ), +// CHECK: ]) diff --git a/test/MC/MachO/darwin-x86_64-reloc.s b/test/MC/MachO/darwin-x86_64-reloc.s index 6b325b085eca..d5e75d121452 100644 --- a/test/MC/MachO/darwin-x86_64-reloc.s +++ b/test/MC/MachO/darwin-x86_64-reloc.s @@ -38,25 +38,29 @@ L_pc: L1: .quad L1 - _prev + .data +.long _foobar@GOTPCREL+4 +.long _foo@GOTPCREL+4 + // CHECK: ('cputype', 16777223) // CHECK: ('cpusubtype', 3) // CHECK: ('filetype', 1) // CHECK: ('num_load_commands', 1) -// CHECK: ('load_commands_size', 256) +// 
CHECK: ('load_commands_size', 336) // CHECK: ('flag', 0) // CHECK: ('reserved', 0) // CHECK: ('load_commands', [ // CHECK: # Load Command 0 // CHECK: (('command', 25) -// CHECK: ('size', 152) +// CHECK: ('size', 232) // CHECK: ('segment_name', '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00') // CHECK: ('vm_addr', 0) -// CHECK: ('vm_size', 181) -// CHECK: ('file_offset', 288) -// CHECK: ('file_size', 181) +// CHECK: ('vm_size', 189) +// CHECK: ('file_offset', 368) +// CHECK: ('file_size', 189) // CHECK: ('maxprot', 7) // CHECK: ('initprot', 7) -// CHECK: ('num_sections', 1) +// CHECK: ('num_sections', 2) // CHECK: ('flags', 0) // CHECK: ('sections', [ // CHECK: # Section 0 @@ -64,9 +68,9 @@ L1: // CHECK: ('segment_name', '__TEXT\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00') // CHECK: ('address', 0) // CHECK: ('size', 181) -// CHECK: ('offset', 288) +// CHECK: ('offset', 368) // CHECK: ('alignment', 0) -// CHECK: ('reloc_offset', 472) +// CHECK: ('reloc_offset', 560) // CHECK: ('num_reloc', 27) // CHECK: ('flags', 0x80000400) // CHECK: ('reserved1', 0) @@ -157,19 +161,42 @@ L1: // CHECK: ('word-1', 0x2d000000)), // CHECK: ]) // CHECK: ('_section_data', '\xc3\xe8\x00\x00\x00\x00\xe8\x04\x00\x00\x00H\x8b\x05\x00\x00\x00\x00\xff5\x00\x00\x00\x00\x8b\x05\x00\x00\x00\x00\x8b\x05\x04\x00\x00\x00\xc6\x05\xff\xff\xff\xff\x12\xc7\x05\xfc\xff\xff\xffxV4\x12\x00\x00\x00\x00\x00\x00\x00\x00\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00H\x8d\x05,\x00\x00\x00H\x8d\x05\x14\x00\x00\x00\x83\x05\x13\x00\x00\x00\x06f\x81\x05\x12\x00\x00\x00\xf4\x01\x81\x05\x10\x00\x00\x00\xf4\x01\x00\x00\x90\x90\x90\x90\x90\x90\x90\x90\x90\x90\x90\x90,\x00\x00\x00\x00\x00\x00\x00\x14\x00\x00\x00\x00\x00\x00\x00\xe4\xff\xff\xff\xff\xff\xff\xff\xd4\xff\xff\xff\xff\xff\xff\xff,\x00\x00\x00\x00\x00\x00\x00') +// CHECK: # Section 1 +// CHECK: (('section_name', '__data\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00') +// CHECK: 
('segment_name', '__DATA\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00') +// CHECK: ('address', 181) +// CHECK: ('size', 8) +// CHECK: ('offset', 549) +// CHECK: ('alignment', 0) +// CHECK: ('reloc_offset', 776) +// CHECK: ('num_reloc', 2) +// CHECK: ('flags', 0x0) +// CHECK: ('reserved1', 0) +// CHECK: ('reserved2', 0) +// CHECK: ('reserved3', 0) +// CHECK: ), +// CHECK: ('_relocations', [ +// CHECK: # Relocation 0 +// CHECK: (('word-0', 0x4), +// CHECK: ('word-1', 0x4d000000)), +// CHECK: # Relocation 1 +// CHECK: (('word-0', 0x0), +// CHECK: ('word-1', 0x4d000004)), +// CHECK: ]) +// CHECK: ('_section_data', '\x04\x00\x00\x00\x04\x00\x00\x00') // CHECK: ]) // CHECK: ), // CHECK: # Load Command 1 // CHECK: (('command', 2) // CHECK: ('size', 24) -// CHECK: ('symoff', 688) -// CHECK: ('nsyms', 4) -// CHECK: ('stroff', 752) -// CHECK: ('strsize', 24) -// CHECK: ('_string_data', '\x00_foo\x00_baz\x00_bar\x00_prev\x00\x00\x00') +// CHECK: ('symoff', 792) +// CHECK: ('nsyms', 5) +// CHECK: ('stroff', 872) +// CHECK: ('strsize', 32) +// CHECK: ('_string_data', '\x00_foobar\x00_foo\x00_baz\x00_bar\x00_prev\x00\x00\x00') // CHECK: ('_symbols', [ // CHECK: # Symbol 0 -// CHECK: (('n_strx', 1) +// CHECK: (('n_strx', 9) // CHECK: ('n_type', 0xe) // CHECK: ('n_sect', 1) // CHECK: ('n_desc', 0) @@ -177,7 +204,7 @@ L1: // CHECK: ('_string', '_foo') // CHECK: ), // CHECK: # Symbol 1 -// CHECK: (('n_strx', 6) +// CHECK: (('n_strx', 14) // CHECK: ('n_type', 0xe) // CHECK: ('n_sect', 1) // CHECK: ('n_desc', 0) @@ -185,7 +212,7 @@ L1: // CHECK: ('_string', '_baz') // CHECK: ), // CHECK: # Symbol 2 -// CHECK: (('n_strx', 11) +// CHECK: (('n_strx', 19) // CHECK: ('n_type', 0xe) // CHECK: ('n_sect', 1) // CHECK: ('n_desc', 0) @@ -193,13 +220,21 @@ L1: // CHECK: ('_string', '_bar') // CHECK: ), // CHECK: # Symbol 3 -// CHECK: (('n_strx', 16) +// CHECK: (('n_strx', 24) // CHECK: ('n_type', 0xe) // CHECK: ('n_sect', 1) // CHECK: ('n_desc', 0) // CHECK: ('n_value', 129) // CHECK: ('_string', 
'_prev') // CHECK: ), +// CHECK: # Symbol 4 +// CHECK: (('n_strx', 1) +// CHECK: ('n_type', 0x1) +// CHECK: ('n_sect', 0) +// CHECK: ('n_desc', 0) +// CHECK: ('n_value', 0) +// CHECK: ('_string', '_foobar') +// CHECK: ), // CHECK: ]) // CHECK: ), // CHECK: # Load Command 2 @@ -210,7 +245,7 @@ L1: // CHECK: ('iextdefsym', 4) // CHECK: ('nextdefsym', 0) // CHECK: ('iundefsym', 4) -// CHECK: ('nundefsym', 0) +// CHECK: ('nundefsym', 1) // CHECK: ('tocoff', 0) // CHECK: ('ntoc', 0) // CHECK: ('modtaboff', 0) diff --git a/test/TableGen/2010-03-24-PrematureDefaults.td b/test/TableGen/2010-03-24-PrematureDefaults.td new file mode 100644 index 000000000000..2ff2d42d2730 --- /dev/null +++ b/test/TableGen/2010-03-24-PrematureDefaults.td @@ -0,0 +1,44 @@ +// RUN: tblgen %s | FileCheck %s +// XFAIL: vg_leak + +class A x = 1> { + int K = k; + bits<2> Bits = x; +} + +// CHECK: def a1 +// CHECK: Bits = { 0, 1 } +def a1 : A<12>; + +// CHECK: def a2 +// CHECK: Bits = { 1, 0 } +def a2 : A<13, 2>; + +// Here was the bug: X.Bits would get resolved to the default a1.Bits while +// resolving the first template argument. When the second template argument +// was processed, X would be set correctly, but Bits retained the default +// value. +class B { + A X = x; + bits<2> Bits = X.Bits; +} + +// CHECK: def b1 +// CHECK: Bits = { 0, 1 } +def b1 : B<27>; + +// CHECK: def b2 +// CHECK: Bits = { 1, 0 } +def b2 : B<28, a2>; + +class C { + bits<2> Bits = x.Bits; +} + +// CHECK: def c1 +// CHECK: Bits = { 0, 1 } +def c1 : C; + +// CHECK: def c2 +// CHECK: Bits = { 1, 0 } +def c2 : C; diff --git a/test/Transforms/GVN/2010-03-31-RedundantPHIs.ll b/test/Transforms/GVN/2010-03-31-RedundantPHIs.ll new file mode 100644 index 000000000000..066e3038b087 --- /dev/null +++ b/test/Transforms/GVN/2010-03-31-RedundantPHIs.ll @@ -0,0 +1,46 @@ +; RUN: opt < %s -gvn -enable-full-load-pre -S | FileCheck %s + +define i8* @cat(i8* %s1, ...) 
nounwind { +entry: + br i1 undef, label %bb, label %bb3 + +bb: ; preds = %entry + unreachable + +bb3: ; preds = %entry + store i8* undef, i8** undef, align 4 + br i1 undef, label %bb5, label %bb6 + +bb5: ; preds = %bb3 + unreachable + +bb6: ; preds = %bb3 + br label %bb12 + +bb8: ; preds = %bb12 + br i1 undef, label %bb9, label %bb10 + +bb9: ; preds = %bb8 + %0 = load i8** undef, align 4 ; [#uses=0] + %1 = load i8** undef, align 4 ; [#uses=0] + br label %bb11 + +bb10: ; preds = %bb8 + br label %bb11 + +bb11: ; preds = %bb10, %bb9 +; CHECK: bb11: +; CHECK: phi +; CHECK-NOT: phi + br label %bb12 + +bb12: ; preds = %bb11, %bb6 +; CHECK: bb12: +; CHECK: phi +; CHECK-NOT: phi + br i1 undef, label %bb8, label %bb13 + +bb13: ; preds = %bb12 +; CHECK: bb13: + ret i8* undef +} diff --git a/test/Transforms/GVN/rle.ll b/test/Transforms/GVN/rle.ll index d419fd24240c..d656c1a5fcc7 100644 --- a/test/Transforms/GVN/rle.ll +++ b/test/Transforms/GVN/rle.ll @@ -531,4 +531,16 @@ out: } +; PR6642 +define i32 @memset_to_load() nounwind readnone { +entry: + %x = alloca [256 x i32], align 4 ; <[256 x i32]*> [#uses=2] + %tmp = bitcast [256 x i32]* %x to i8* ; [#uses=1] + call void @llvm.memset.i64(i8* %tmp, i8 0, i64 1024, i32 4) + %arraydecay = getelementptr inbounds [256 x i32]* %x, i32 0, i32 0 ; + %tmp1 = load i32* %arraydecay ; [#uses=1] + ret i32 %tmp1 +; CHECK: @memset_to_load +; CHECK: ret i32 0 +} diff --git a/test/Transforms/Inline/noinline.ll b/test/Transforms/Inline/noinline.ll new file mode 100644 index 000000000000..dc3f6e003094 --- /dev/null +++ b/test/Transforms/Inline/noinline.ll @@ -0,0 +1,18 @@ +; RUN: opt %s -inline -S | FileCheck %s +; PR6682 +declare void @foo() nounwind + +define void @bar() nounwind { +entry: + tail call void @foo() nounwind + ret void +} + +define void @bazz() nounwind { +entry: + tail call void @bar() nounwind noinline + ret void +} + +; CHECK: define void @bazz() +; CHECK: call void @bar() diff --git a/test/Transforms/InstCombine/objsize.ll 
b/test/Transforms/InstCombine/objsize.ll index cd7b7c80679b..f8b2ffca01ed 100644 --- a/test/Transforms/InstCombine/objsize.ll +++ b/test/Transforms/InstCombine/objsize.ll @@ -118,6 +118,7 @@ entry: ret i32 0 } +; rdar://7782496 @s = external global i8* define void @test5(i32 %n) nounwind ssp { @@ -127,11 +128,23 @@ entry: %1 = tail call i32 @llvm.objectsize.i32(i8* %0, i1 false) %2 = load i8** @s, align 8 ; CHECK-NOT: @llvm.objectsize -; CHECK: @__memcpy_chk(i8* %0, i8* %1, i32 10, i32 20) +; CHECK: @llvm.memcpy.i32(i8* %0, i8* %1, i32 10, i32 1) %3 = tail call i8* @__memcpy_chk(i8* %0, i8* %2, i32 10, i32 %1) nounwind ret void } +define void @test6(i32 %n) nounwind ssp { +; CHECK: @test6 +entry: + %0 = tail call noalias i8* @malloc(i32 20) nounwind + %1 = tail call i32 @llvm.objectsize.i32(i8* %0, i1 false) + %2 = load i8** @s, align 8 +; CHECK-NOT: @llvm.objectsize +; CHECK: @__memcpy_chk(i8* %0, i8* %1, i32 30, i32 20) + %3 = tail call i8* @__memcpy_chk(i8* %0, i8* %2, i32 30, i32 %1) nounwind + ret void +} + declare i8* @__memset_chk(i8*, i32, i64, i64) nounwind declare noalias i8* @malloc(i32) nounwind diff --git a/test/Transforms/SimplifyCFG/2010-03-30-InvokeCrash.ll b/test/Transforms/SimplifyCFG/2010-03-30-InvokeCrash.ll new file mode 100644 index 000000000000..ced89cf4c2b3 --- /dev/null +++ b/test/Transforms/SimplifyCFG/2010-03-30-InvokeCrash.ll @@ -0,0 +1,18 @@ +; RUN: opt %s -simplifycfg -disable-output +; END. 
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" +target triple = "x86_64-unknown-linux-gnu" + +declare void @bar(i32) + +define void @foo() { +entry: + invoke void @bar(i32 undef) + to label %r unwind label %u + +r: ; preds = %entry + ret void + +u: ; preds = %entry + unwind +} diff --git a/test/Transforms/SimplifyLibCalls/StrCpy.ll b/test/Transforms/SimplifyLibCalls/StrCpy.ll index 75429842cd21..c3cc58ce3806 100644 --- a/test/Transforms/SimplifyLibCalls/StrCpy.ll +++ b/test/Transforms/SimplifyLibCalls/StrCpy.ll @@ -1,30 +1,37 @@ ; Test that the StrCpyOptimizer works correctly -; RUN: opt < %s -simplify-libcalls -S | \ -; RUN: not grep {call.*strcpy} +; RUN: opt < %s -simplify-libcalls -S | FileCheck %s ; This transformation requires the pointer size, as it assumes that size_t is ; the size of a pointer. -target datalayout = "-p:64:64:64" +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32" -@hello = constant [6 x i8] c"hello\00" ; <[6 x i8]*> [#uses=1] -@null = constant [1 x i8] zeroinitializer ; <[1 x i8]*> [#uses=1] -@null_hello = constant [7 x i8] c"\00hello\00" ; <[7 x i8]*> [#uses=1] +@hello = constant [6 x i8] c"hello\00" declare i8* @strcpy(i8*, i8*) -declare i32 @puts(i8*) +declare i8* @__strcpy_chk(i8*, i8*, i32) nounwind -define i32 @main() { - %target = alloca [1024 x i8] ; <[1024 x i8]*> [#uses=1] - %arg1 = getelementptr [1024 x i8]* %target, i32 0, i32 0 ; [#uses=2] - store i8 0, i8* %arg1 - %arg2 = getelementptr [6 x i8]* @hello, i32 0, i32 0 ; [#uses=1] - %rslt1 = call i8* @strcpy( i8* %arg1, i8* %arg2 ) ; [#uses=1] - %arg3 = getelementptr [1 x i8]* @null, i32 0, i32 0 ; [#uses=1] - %rslt2 = call i8* @strcpy( i8* %rslt1, i8* %arg3 ) ; [#uses=1] - %arg4 = getelementptr [7 x i8]* @null_hello, i32 0, i32 0 ; [#uses=1] - %rslt3 = call i8* 
@strcpy( i8* %rslt2, i8* %arg4 ) ; [#uses=1] - call i32 @puts( i8* %rslt3 ) ; :1 [#uses=0] - ret i32 0 +declare i32 @llvm.objectsize.i32(i8*, i1) nounwind readonly + +; rdar://6839935 + +define i32 @t1() { +; CHECK: @t1 + %target = alloca [1024 x i8] + %arg1 = getelementptr [1024 x i8]* %target, i32 0, i32 0 + %arg2 = getelementptr [6 x i8]* @hello, i32 0, i32 0 + %rslt1 = call i8* @strcpy( i8* %arg1, i8* %arg2 ) +; CHECK: @llvm.memcpy.i32 + ret i32 0 } +define i32 @t2() { +; CHECK: @t2 + %target = alloca [1024 x i8] + %arg1 = getelementptr [1024 x i8]* %target, i32 0, i32 0 + %arg2 = getelementptr [6 x i8]* @hello, i32 0, i32 0 + %tmp1 = call i32 @llvm.objectsize.i32(i8* %arg1, i1 false) + %rslt1 = call i8* @__strcpy_chk(i8* %arg1, i8* %arg2, i32 %tmp1) +; CHECK: @__memcpy_chk + ret i32 0 +} diff --git a/tools/Makefile b/tools/Makefile index 58130570d7aa..e124422cd6de 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -22,7 +22,6 @@ PARALLEL_DIRS := opt llvm-as llvm-dis \ lli llvm-extract \ bugpoint llvm-bcanalyzer llvm-stub \ llvm-mc llvmc - # Let users override the set of tools to build from the command line. ifdef ONLY_TOOLS @@ -38,7 +37,7 @@ ifeq ($(ENABLE_PIC),1) # No support for dynamic libraries on windows targets. ifneq ($(TARGET_OS), $(filter $(TARGET_OS), Cygwin MingW)) PARALLEL_DIRS += edis - + # gold only builds if binutils is around. It requires "lto" to build before # it so it is added to DIRS. ifdef BINUTILS_INCDIR @@ -54,4 +53,9 @@ ifeq ($(filter $(TARGETS_TO_BUILD), X86),) PARALLEL_DIRS := $(filter-out edis, $(PARALLEL_DIRS)) endif +# Don't build edis if we explicitly disabled it. 
+ifeq ($(DISABLE_EDIS),1) + PARALLEL_DIRS := $(filter-out edis, $(PARALLEL_DIRS)) +endif + include $(LEVEL)/Makefile.common diff --git a/tools/bugpoint/BugDriver.cpp b/tools/bugpoint/BugDriver.cpp index 813c96cc795d..d676a4374ac6 100644 --- a/tools/bugpoint/BugDriver.cpp +++ b/tools/bugpoint/BugDriver.cpp @@ -21,7 +21,6 @@ #include "llvm/Support/IRReader.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FileUtilities.h" -#include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/raw_ostream.h" #include "llvm/System/Host.h" @@ -75,6 +74,10 @@ BugDriver::BugDriver(const char *toolname, bool as_child, bool find_bugs, run_as_child(as_child), run_find_bugs(find_bugs), Timeout(timeout), MemoryLimit(memlimit), UseValgrind(use_valgrind) {} +BugDriver::~BugDriver() { + delete Program; +} + /// ParseInputFile - Given a bitcode or assembly input filename, parse and /// return it, or return null if not possible. diff --git a/tools/bugpoint/BugDriver.h b/tools/bugpoint/BugDriver.h index 0a10a61c2159..abc249868de2 100644 --- a/tools/bugpoint/BugDriver.h +++ b/tools/bugpoint/BugDriver.h @@ -65,6 +65,7 @@ class BugDriver { BugDriver(const char *toolname, bool as_child, bool find_bugs, unsigned timeout, unsigned memlimit, bool use_valgrind, LLVMContext& ctxt); + ~BugDriver(); const char *getToolName() const { return ToolName; } diff --git a/tools/edis/Makefile b/tools/edis/Makefile index cd8f4b067ae8..7f7b097e2ce3 100644 --- a/tools/edis/Makefile +++ b/tools/edis/Makefile @@ -25,26 +25,23 @@ LINK_COMPONENTS := $(TARGETS_TO_BUILD) x86asmprinter x86disassembler include $(LEVEL)/Makefile.common ifeq ($(HOST_OS),Darwin) - # set dylib internal version number to llvmCore submission number - ifdef LLVM_SUBMIT_VERSION - LLVMLibsOptions := $(LLVMLibsOptions) -Wl,-current_version \ - -Wl,$(LLVM_SUBMIT_VERSION).$(LLVM_SUBMIT_SUBVERSION) \ - -Wl,-compatibility_version -Wl,1 - endif # extra options to override libtool defaults LLVMLibsOptions 
:= $(LLVMLibsOptions) \ -avoid-version \ -Wl,-exported_symbols_list -Wl,$(PROJ_SRC_DIR)/EnhancedDisassembly.exports \ - -Wl,-dead_strip \ - -Wl,-seg1addr -Wl,0xE0000000 + -Wl,-dead_strip + + ifdef EDIS_VERSION + LLVMLibsOptions := -Wl,-current_version -Wl,$(EDIS_VERSION) \ + -Wl,-compatibility_version -Wl,1 + endif # Mac OS X 10.4 and earlier tools do not allow a second -install_name on command line - # Path is /Developer/usr/local/lib for now; will use an rpath-based mechanism soon DARWIN_VERS := $(shell echo $(TARGET_TRIPLE) | sed 's/.*darwin\([0-9]*\).*/\1/') ifneq ($(DARWIN_VERS),8) LLVMLibsOptions := $(LLVMLibsOptions) \ -no-undefined -Wl,-install_name \ - -Wl,"/Developer/usr/local/lib/lib$(LIBRARYNAME)$(SHLIBEXT)" + -Wl,"@rpath/lib$(LIBRARYNAME)$(SHLIBEXT)" endif endif diff --git a/tools/llc/llc.cpp b/tools/llc/llc.cpp index 810ba4220099..aa977a7032c4 100644 --- a/tools/llc/llc.cpp +++ b/tools/llc/llc.cpp @@ -23,13 +23,10 @@ #include "llvm/CodeGen/LinkAllAsmWriterComponents.h" #include "llvm/CodeGen/LinkAllCodegenComponents.h" #include "llvm/Config/config.h" -#include "llvm/LinkAllVMCore.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/FileUtilities.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/ManagedStatic.h" -#include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/PluginLoader.h" #include "llvm/Support/PrettyStackTrace.h" #include "llvm/System/Host.h" @@ -39,7 +36,6 @@ #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegistry.h" #include "llvm/Target/TargetSelect.h" -#include "llvm/Transforms/Scalar.h" #include using namespace llvm; diff --git a/tools/llvm-extract/llvm-extract.cpp b/tools/llvm-extract/llvm-extract.cpp index 231634c84bdf..276dfd638b52 100644 --- a/tools/llvm-extract/llvm-extract.cpp +++ b/tools/llvm-extract/llvm-extract.cpp @@ -22,7 +22,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/IRReader.h" #include 
"llvm/Support/ManagedStatic.h" -#include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/PrettyStackTrace.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/SystemUtils.h" diff --git a/tools/llvm-ld/Optimize.cpp b/tools/llvm-ld/Optimize.cpp index 6143dc87d356..3fb0079dfede 100644 --- a/tools/llvm-ld/Optimize.cpp +++ b/tools/llvm-ld/Optimize.cpp @@ -13,17 +13,12 @@ #include "llvm/Module.h" #include "llvm/PassManager.h" -#include "llvm/Analysis/Passes.h" -#include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/Verifier.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/StandardPasses.h" #include "llvm/Support/raw_ostream.h" #include "llvm/System/DynamicLibrary.h" #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetMachine.h" -#include "llvm/Transforms/IPO.h" -#include "llvm/Transforms/Scalar.h" #include "llvm/Support/PassNameParser.h" #include "llvm/Support/PluginLoader.h" using namespace llvm; diff --git a/tools/llvm-ld/llvm-ld.cpp b/tools/llvm-ld/llvm-ld.cpp index 118f6b720c6e..29b74b8a72bf 100644 --- a/tools/llvm-ld/llvm-ld.cpp +++ b/tools/llvm-ld/llvm-ld.cpp @@ -125,13 +125,18 @@ static cl::opt CO9("m", cl::Hidden, /// everywhere. static std::string progname; +/// FileRemover objects to clean up output files in the event of an error. +static FileRemover OutputRemover; +static FileRemover BitcodeOutputRemover; + /// PrintAndExit - Prints a message to standard error and exits with error code /// /// Inputs: /// Message - The message to print to standard error. 
/// -static void PrintAndExit(const std::string &Message, int errcode = 1) { +static void PrintAndExit(const std::string &Message, Module *M, int errcode = 1) { errs() << progname << ": " << Message << "\n"; + delete M; llvm_shutdown(); exit(errcode); } @@ -234,17 +239,10 @@ void GenerateBitcode(Module* M, const std::string& FileName) { raw_fd_ostream Out(FileName.c_str(), ErrorInfo, raw_fd_ostream::F_Binary); if (!ErrorInfo.empty()) - PrintAndExit(ErrorInfo); - - // Ensure that the bitcode file gets removed from the disk if we get a - // terminating signal. - sys::RemoveFileOnSignal(sys::Path(FileName)); + PrintAndExit(ErrorInfo, M); // Write it out WriteBitcodeToFile(M, Out); - - // Close the bitcode file. - Out.close(); } /// GenerateAssembly - generates a native assembly language source file from the @@ -408,7 +406,7 @@ static int GenerateNative(const std::string &OutputFilename, /// EmitShellScript - Output the wrapper file that invokes the JIT on the LLVM /// bitcode file for the program. -static void EmitShellScript(char **argv) { +static void EmitShellScript(char **argv, Module *M) { if (Verbose) outs() << "Emitting Shell Script\n"; #if defined(_WIN32) || defined(__CYGWIN__) @@ -419,10 +417,10 @@ static void EmitShellScript(char **argv) { sys::Path llvmstub = FindExecutable("llvm-stub.exe", argv[0], (void *)(intptr_t)&Optimize); if (llvmstub.isEmpty()) - PrintAndExit("Could not find llvm-stub.exe executable!"); + PrintAndExit("Could not find llvm-stub.exe executable!", M); if (0 != sys::CopyFile(sys::Path(OutputFilename), llvmstub, &ErrMsg)) - PrintAndExit(ErrMsg); + PrintAndExit(ErrMsg, M); return; #endif @@ -431,7 +429,7 @@ static void EmitShellScript(char **argv) { std::string ErrorInfo; raw_fd_ostream Out2(OutputFilename.c_str(), ErrorInfo); if (!ErrorInfo.empty()) - PrintAndExit(ErrorInfo); + PrintAndExit(ErrorInfo, M); Out2 << "#!/bin/sh\n"; // Allow user to setenv LLVMINTERP if lli is not in their PATH. 
@@ -470,7 +468,6 @@ static void EmitShellScript(char **argv) { Out2 << " -load=" << FullLibraryPath.str() << " \\\n"; } Out2 << " " << BitcodeOutputFilename << " ${1+\"$@\"}\n"; - Out2.close(); } // BuildLinkItems -- This function generates a LinkItemList for the LinkItems @@ -521,6 +518,39 @@ int main(int argc, char **argv, char **envp) { // Parse the command line options cl::ParseCommandLineOptions(argc, argv, "llvm linker\n"); +#if defined(_WIN32) || defined(__CYGWIN__) + if (!LinkAsLibrary) { + // Default to "a.exe" instead of "a.out". + if (OutputFilename.getNumOccurrences() == 0) + OutputFilename = "a.exe"; + + // If there is no suffix add an "exe" one. + sys::Path ExeFile( OutputFilename ); + if (ExeFile.getSuffix() == "") { + ExeFile.appendSuffix("exe"); + OutputFilename = ExeFile.str(); + } + } +#endif + + // Generate the bitcode for the optimized module. + // If -b wasn't specified, use the name specified + // with -o to construct BitcodeOutputFilename. + if (BitcodeOutputFilename.empty()) { + BitcodeOutputFilename = OutputFilename; + if (!LinkAsLibrary) BitcodeOutputFilename += ".bc"; + } + + // Arrange for the bitcode output file to be deleted on any errors. + BitcodeOutputRemover.setFile(sys::Path(BitcodeOutputFilename)); + sys::RemoveFileOnSignal(sys::Path(BitcodeOutputFilename)); + + // Arrange for the output file to be deleted on any errors. + if (!LinkAsLibrary) { + OutputRemover.setFile(sys::Path(OutputFilename)); + sys::RemoveFileOnSignal(sys::Path(OutputFilename)); + } + // Construct a Linker (now that Verbose is set) Linker TheLinker(progname, OutputFilename, Context, Verbose); @@ -563,29 +593,7 @@ int main(int argc, char **argv, char **envp) { // Optimize the module Optimize(Composite.get()); -#if defined(_WIN32) || defined(__CYGWIN__) - if (!LinkAsLibrary) { - // Default to "a.exe" instead of "a.out". - if (OutputFilename.getNumOccurrences() == 0) - OutputFilename = "a.exe"; - - // If there is no suffix add an "exe" one. 
- sys::Path ExeFile( OutputFilename ); - if (ExeFile.getSuffix() == "") { - ExeFile.appendSuffix("exe"); - OutputFilename = ExeFile.str(); - } - } -#endif - - // Generate the bitcode for the optimized module. - // If -b wasn't specified, use the name specified - // with -o to construct BitcodeOutputFilename. - if (BitcodeOutputFilename.empty()) { - BitcodeOutputFilename = OutputFilename; - if (!LinkAsLibrary) BitcodeOutputFilename += ".bc"; - } - + // Generate the bitcode output. GenerateBitcode(Composite.get(), BitcodeOutputFilename); // If we are not linking a library, generate either a native executable @@ -601,13 +609,13 @@ int main(int argc, char **argv, char **envp) { prog = sys::Program::FindProgramByName(*I); if (prog.isEmpty()) PrintAndExit(std::string("Optimization program '") + *I + - "' is not found or not executable."); + "' is not found or not executable.", Composite.get()); } // Get the program arguments sys::Path tmp_output("opt_result"); std::string ErrMsg; if (tmp_output.createTemporaryFileOnDisk(true, &ErrMsg)) - PrintAndExit(ErrMsg); + PrintAndExit(ErrMsg, Composite.get()); const char* args[4]; args[0] = I->c_str(); @@ -615,15 +623,16 @@ int main(int argc, char **argv, char **envp) { args[2] = tmp_output.c_str(); args[3] = 0; if (0 == sys::Program::ExecuteAndWait(prog, args, 0,0,0,0, &ErrMsg)) { - if (tmp_output.isBitcodeFile() || tmp_output.isBitcodeFile()) { + if (tmp_output.isBitcodeFile()) { sys::Path target(BitcodeOutputFilename); target.eraseFromDisk(); if (tmp_output.renamePathOnDisk(target, &ErrMsg)) - PrintAndExit(ErrMsg, 2); + PrintAndExit(ErrMsg, Composite.get(), 2); } else - PrintAndExit("Post-link optimization output is not bitcode"); + PrintAndExit("Post-link optimization output is not bitcode", + Composite.get()); } else { - PrintAndExit(ErrMsg); + PrintAndExit(ErrMsg, Composite.get()); } } } @@ -637,79 +646,77 @@ int main(int argc, char **argv, char **envp) { sys::Path AssemblyFile ( OutputFilename); 
AssemblyFile.appendSuffix("s"); - // Mark the output files for removal if we get an interrupt. + // Mark the output files for removal. + FileRemover AssemblyFileRemover(AssemblyFile); sys::RemoveFileOnSignal(AssemblyFile); - sys::RemoveFileOnSignal(sys::Path(OutputFilename)); // Determine the locations of the llc and gcc programs. sys::Path llc = FindExecutable("llc", argv[0], (void *)(intptr_t)&Optimize); if (llc.isEmpty()) - PrintAndExit("Failed to find llc"); + PrintAndExit("Failed to find llc", Composite.get()); sys::Path gcc = sys::Program::FindProgramByName("gcc"); if (gcc.isEmpty()) - PrintAndExit("Failed to find gcc"); + PrintAndExit("Failed to find gcc", Composite.get()); // Generate an assembly language file for the bitcode. std::string ErrMsg; if (0 != GenerateAssembly(AssemblyFile.str(), BitcodeOutputFilename, llc, ErrMsg)) - PrintAndExit(ErrMsg); + PrintAndExit(ErrMsg, Composite.get()); if (0 != GenerateNative(OutputFilename, AssemblyFile.str(), NativeLinkItems, gcc, envp, ErrMsg)) - PrintAndExit(ErrMsg); - - // Remove the assembly language file. - AssemblyFile.eraseFromDisk(); + PrintAndExit(ErrMsg, Composite.get()); } else if (NativeCBE) { sys::Path CFile (OutputFilename); CFile.appendSuffix("cbe.c"); - // Mark the output files for removal if we get an interrupt. + // Mark the output files for removal. + FileRemover CFileRemover(CFile); sys::RemoveFileOnSignal(CFile); - sys::RemoveFileOnSignal(sys::Path(OutputFilename)); // Determine the locations of the llc and gcc programs. sys::Path llc = FindExecutable("llc", argv[0], (void *)(intptr_t)&Optimize); if (llc.isEmpty()) - PrintAndExit("Failed to find llc"); + PrintAndExit("Failed to find llc", Composite.get()); sys::Path gcc = sys::Program::FindProgramByName("gcc"); if (gcc.isEmpty()) - PrintAndExit("Failed to find gcc"); + PrintAndExit("Failed to find gcc", Composite.get()); // Generate an assembly language file for the bitcode. 
std::string ErrMsg; if (GenerateCFile(CFile.str(), BitcodeOutputFilename, llc, ErrMsg)) - PrintAndExit(ErrMsg); + PrintAndExit(ErrMsg, Composite.get()); if (GenerateNative(OutputFilename, CFile.str(), NativeLinkItems, gcc, envp, ErrMsg)) - PrintAndExit(ErrMsg); - - // Remove the assembly language file. - CFile.eraseFromDisk(); - + PrintAndExit(ErrMsg, Composite.get()); } else { - EmitShellScript(argv); + EmitShellScript(argv, Composite.get()); } // Make the script executable... std::string ErrMsg; if (sys::Path(OutputFilename).makeExecutableOnDisk(&ErrMsg)) - PrintAndExit(ErrMsg); + PrintAndExit(ErrMsg, Composite.get()); // Make the bitcode file readable and directly executable in LLEE as well if (sys::Path(BitcodeOutputFilename).makeExecutableOnDisk(&ErrMsg)) - PrintAndExit(ErrMsg); + PrintAndExit(ErrMsg, Composite.get()); if (sys::Path(BitcodeOutputFilename).makeReadableOnDisk(&ErrMsg)) - PrintAndExit(ErrMsg); + PrintAndExit(ErrMsg, Composite.get()); } + // Operations which may fail are now complete. 
+ BitcodeOutputRemover.releaseFile(); + if (!LinkAsLibrary) + OutputRemover.releaseFile(); + // Graceful exit return 0; } diff --git a/tools/llvm-link/llvm-link.cpp b/tools/llvm-link/llvm-link.cpp index fae4d107b1a2..c60e56ae9b6e 100644 --- a/tools/llvm-link/llvm-link.cpp +++ b/tools/llvm-link/llvm-link.cpp @@ -19,7 +19,6 @@ #include "llvm/Bitcode/ReaderWriter.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ManagedStatic.h" -#include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/PrettyStackTrace.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/SystemUtils.h" diff --git a/tools/llvm-mc/llvm-mc.cpp b/tools/llvm-mc/llvm-mc.cpp index 3c23990af6bf..6dce5cc96c74 100644 --- a/tools/llvm-mc/llvm-mc.cpp +++ b/tools/llvm-mc/llvm-mc.cpp @@ -55,7 +55,11 @@ static cl::opt OutputAsmVariant("output-asm-variant", cl::desc("Syntax variant to use for output printing")); +static cl::opt +RelaxAll("mc-relax-all", cl::desc("Relax all fixups")); + enum OutputFileType { + OFT_Null, OFT_AssemblyFile, OFT_ObjectFile }; @@ -65,6 +69,8 @@ FileType("filetype", cl::init(OFT_AssemblyFile), cl::values( clEnumValN(OFT_AssemblyFile, "asm", "Emit an assembly ('.s') file"), + clEnumValN(OFT_Null, "null", + "Don't emit anything (for timing purposes)"), clEnumValN(OFT_ObjectFile, "obj", "Emit a native object ('.o') file"), clEnumValEnd)); @@ -289,11 +295,13 @@ static int AssembleInput(const char *ProgName) { CE.reset(TheTarget->createCodeEmitter(*TM, Ctx)); Str.reset(createAsmStreamer(Ctx, *Out,TM->getTargetData()->isLittleEndian(), /*asmverbose*/true, IP, CE.get(), ShowInst)); + } else if (FileType == OFT_Null) { + Str.reset(createNullStreamer(Ctx)); } else { assert(FileType == OFT_ObjectFile && "Invalid file type!"); CE.reset(TheTarget->createCodeEmitter(*TM, Ctx)); TAB.reset(TheTarget->createAsmBackend(TripleName)); - Str.reset(createMachOStreamer(Ctx, *TAB, *Out, CE.get())); + Str.reset(createMachOStreamer(Ctx, *TAB, *Out, CE.get(), RelaxAll)); } AsmParser 
Parser(SrcMgr, Ctx, *Str.get(), *MAI); diff --git a/tools/llvmc/plugins/Base/Base.td.in b/tools/llvmc/plugins/Base/Base.td.in index ac0f665925c8..3ad07c0d83a1 100644 --- a/tools/llvmc/plugins/Base/Base.td.in +++ b/tools/llvmc/plugins/Base/Base.td.in @@ -233,6 +233,8 @@ def llvm_gcc_assembler : Tool< (switch_on "c"), (stop_compilation), (not_empty "arch"), (forward "arch"), (not_empty "Xassembler"), (forward "Xassembler"), + (switch_on "m32"), (forward "m32"), + (switch_on "m64"), (forward "m64"), (not_empty "Wa,"), (forward "Wa,"))) ]>; diff --git a/tools/lto/LTOCodeGenerator.cpp b/tools/lto/LTOCodeGenerator.cpp index 15753d350ff8..bc65b3afc28a 100644 --- a/tools/lto/LTOCodeGenerator.cpp +++ b/tools/lto/LTOCodeGenerator.cpp @@ -24,8 +24,6 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Triple.h" #include "llvm/Analysis/Passes.h" -#include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/Verifier.h" #include "llvm/Bitcode/ReaderWriter.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" @@ -36,8 +34,6 @@ #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegistry.h" #include "llvm/Target/TargetSelect.h" -#include "llvm/Transforms/IPO.h" -#include "llvm/Transforms/Scalar.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/MemoryBuffer.h" diff --git a/tools/opt/opt.cpp b/tools/opt/opt.cpp index b123d54eeeaa..311f67173f37 100644 --- a/tools/opt/opt.cpp +++ b/tools/opt/opt.cpp @@ -28,7 +28,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/IRReader.h" #include "llvm/Support/ManagedStatic.h" -#include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/PluginLoader.h" #include "llvm/Support/PrettyStackTrace.h" #include "llvm/Support/StandardPasses.h" @@ -424,9 +423,9 @@ int main(int argc, char **argv) { if (TD) Passes.add(TD); - FunctionPassManager *FPasses = NULL; + OwningPtr FPasses; if (OptLevelO1 || OptLevelO2 || OptLevelO3) { - FPasses = new 
FunctionPassManager(M.get()); + FPasses.reset(new FunctionPassManager(M.get())); if (TD) FPasses->add(new TargetData(*TD)); } diff --git a/unittests/ADT/SmallVectorTest.cpp b/unittests/ADT/SmallVectorTest.cpp index d7dc3af52feb..991c7d6caac7 100644 --- a/unittests/ADT/SmallVectorTest.cpp +++ b/unittests/ADT/SmallVectorTest.cpp @@ -14,6 +14,7 @@ #include "gtest/gtest.h" #include "llvm/ADT/SmallVector.h" #include +#include using namespace llvm; @@ -399,4 +400,9 @@ TEST_F(SmallVectorTest, DirectVectorTest) { EXPECT_EQ(4, theVector[3].getValue()); } +TEST_F(SmallVectorTest, IteratorTest) { + std::list L; + theVector.insert(theVector.end(), L.begin(), L.end()); +} + } diff --git a/utils/TableGen/CodeGenDAGPatterns.cpp b/utils/TableGen/CodeGenDAGPatterns.cpp index 4cc9b79a291e..d2c0195c6dcf 100644 --- a/utils/TableGen/CodeGenDAGPatterns.cpp +++ b/utils/TableGen/CodeGenDAGPatterns.cpp @@ -61,9 +61,9 @@ EEVT::TypeSet::TypeSet(const std::vector &VTList) { assert(VTList[0] != MVT::iAny && VTList[0] != MVT::vAny && VTList[0] != MVT::fAny); - // Remove duplicates. + // Verify no duplicates. array_pod_sort(TypeVec.begin(), TypeVec.end()); - TypeVec.erase(std::unique(TypeVec.begin(), TypeVec.end()), TypeVec.end()); + assert(std::unique(TypeVec.begin(), TypeVec.end()) == TypeVec.end()); } /// FillWithPossibleTypes - Set to all legal types and return true, only valid @@ -394,24 +394,39 @@ bool EEVT::TypeSet::EnforceSmallerThan(EEVT::TypeSet &Other, TreePattern &TP) { } /// EnforceVectorEltTypeIs - 'this' is now constrainted to be a vector type -/// whose element is VT. -bool EEVT::TypeSet::EnforceVectorEltTypeIs(MVT::SimpleValueType VT, +/// whose element is specified by VTOperand. +bool EEVT::TypeSet::EnforceVectorEltTypeIs(EEVT::TypeSet &VTOperand, TreePattern &TP) { - TypeSet InputSet(*this); + // "This" must be a vector and "VTOperand" must be a scalar. 
bool MadeChange = false; + MadeChange |= EnforceVector(TP); + MadeChange |= VTOperand.EnforceScalar(TP); + + // If we know the vector type, it forces the scalar to agree. + if (isConcrete()) { + EVT IVT = getConcrete(); + IVT = IVT.getVectorElementType(); + return MadeChange | + VTOperand.MergeInTypeInfo(IVT.getSimpleVT().SimpleTy, TP); + } + + // If the scalar type is known, filter out vector types whose element types + // disagree. + if (!VTOperand.isConcrete()) + return MadeChange; - // If we know nothing, then get the full set. - if (TypeVec.empty()) - MadeChange = FillWithPossibleTypes(TP, isVector, "vector"); + MVT::SimpleValueType VT = VTOperand.getConcrete(); - // Filter out all the non-vector types and types which don't have the right - // element type. - for (unsigned i = 0; i != TypeVec.size(); ++i) - if (!isVector(TypeVec[i]) || - EVT(TypeVec[i]).getVectorElementType().getSimpleVT().SimpleTy != VT) { + TypeSet InputSet(*this); + + // Filter out all the types which don't have the right element type. + for (unsigned i = 0; i != TypeVec.size(); ++i) { + assert(isVector(TypeVec[i]) && "EnforceVector didn't work"); + if (EVT(TypeVec[i]).getVectorElementType().getSimpleVT().SimpleTy != VT) { TypeVec.erase(TypeVec.begin()+i--); MadeChange = true; } + } if (TypeVec.empty()) // FIXME: Really want an SMLoc here! TP.error("Type inference contradiction found, forcing '" + @@ -476,6 +491,59 @@ void DumpDepVars(MultipleUseVarSet &DepVars) { // PatternToMatch implementation // + +/// getPatternSize - Return the 'size' of this pattern. We want to match large +/// patterns before small ones. This is used to determine the size of a +/// pattern. +static unsigned getPatternSize(const TreePatternNode *P, + const CodeGenDAGPatterns &CGP) { + unsigned Size = 3; // The node itself. + // If the root node is a ConstantSDNode, increases its size. + // e.g. (set R32:$dst, 0). 
+ if (P->isLeaf() && dynamic_cast(P->getLeafValue())) + Size += 2; + + // FIXME: This is a hack to statically increase the priority of patterns + // which maps a sub-dag to a complex pattern. e.g. favors LEA over ADD. + // Later we can allow complexity / cost for each pattern to be (optionally) + // specified. To get best possible pattern match we'll need to dynamically + // calculate the complexity of all patterns a dag can potentially map to. + const ComplexPattern *AM = P->getComplexPatternInfo(CGP); + if (AM) + Size += AM->getNumOperands() * 3; + + // If this node has some predicate function that must match, it adds to the + // complexity of this node. + if (!P->getPredicateFns().empty()) + ++Size; + + // Count children in the count if they are also nodes. + for (unsigned i = 0, e = P->getNumChildren(); i != e; ++i) { + TreePatternNode *Child = P->getChild(i); + if (!Child->isLeaf() && Child->getNumTypes() && + Child->getType(0) != MVT::Other) + Size += getPatternSize(Child, CGP); + else if (Child->isLeaf()) { + if (dynamic_cast(Child->getLeafValue())) + Size += 5; // Matches a ConstantSDNode (+3) and a specific value (+2). + else if (Child->getComplexPatternInfo(CGP)) + Size += getPatternSize(Child, CGP); + else if (!Child->getPredicateFns().empty()) + ++Size; + } + } + + return Size; +} + +/// Compute the complexity metric for the input pattern. This roughly +/// corresponds to the number of nodes that are covered. +unsigned PatternToMatch:: +getPatternComplexity(const CodeGenDAGPatterns &CGP) const { + return getPatternSize(getSrcPattern(), CGP) + getAddedComplexity(); +} + + /// getPredicateCheck - Return a single string containing all of this /// pattern's predicates concatenated with "&&" operators. 
/// @@ -509,6 +577,9 @@ SDTypeConstraint::SDTypeConstraint(Record *R) { if (R->isSubClassOf("SDTCisVT")) { ConstraintType = SDTCisVT; x.SDTCisVT_Info.VT = getValueType(R->getValueAsDef("VT")); + if (x.SDTCisVT_Info.VT == MVT::isVoid) + throw TGError(R->getLoc(), "Cannot use 'Void' as type to SDTCisVT"); + } else if (R->isSubClassOf("SDTCisPtrTy")) { ConstraintType = SDTCisPtrTy; } else if (R->isSubClassOf("SDTCisInt")) { @@ -568,13 +639,6 @@ static TreePatternNode *getOperandNum(unsigned OpNo, TreePatternNode *N, bool SDTypeConstraint::ApplyTypeConstraint(TreePatternNode *N, const SDNodeInfo &NodeInfo, TreePattern &TP) const { - // Check that the number of operands is sane. Negative operands -> varargs. - if (NodeInfo.getNumOperands() >= 0) { - if (N->getNumChildren() != (unsigned)NodeInfo.getNumOperands()) - TP.error(N->getOperator()->getName() + " node requires exactly " + - itostr(NodeInfo.getNumOperands()) + " operands!"); - } - unsigned ResNo = 0; // The result number being referenced. TreePatternNode *NodeToApply = getOperandNum(OperandNo, N, NodeInfo, ResNo); @@ -612,22 +676,15 @@ bool SDTypeConstraint::ApplyTypeConstraint(TreePatternNode *N, TP.error(N->getOperator()->getName() + " expects a VT operand!"); MVT::SimpleValueType VT = getValueType(static_cast(NodeToApply->getLeafValue())->getDef()); - if (!isInteger(VT)) - TP.error(N->getOperator()->getName() + " VT operand must be integer!"); + + EEVT::TypeSet TypeListTmp(VT, TP); unsigned OResNo = 0; TreePatternNode *OtherNode = getOperandNum(x.SDTCisVTSmallerThanOp_Info.OtherOperandNum, N, NodeInfo, OResNo); - - // It must be integer. - bool MadeChange = OtherNode->getExtType(OResNo).EnforceInteger(TP); - // This doesn't try to enforce any information on the OtherNode, it just - // validates it when information is determined. - if (OtherNode->hasTypeSet(OResNo) && OtherNode->getType(OResNo) <= VT) - OtherNode->UpdateNodeType(OResNo, MVT::Other, TP); // Throw an error. 
- return MadeChange; + return TypeListTmp.EnforceSmallerThan(OtherNode->getExtType(OResNo), TP); } case SDTCisOpSmallerThanOp: { unsigned BResNo = 0; @@ -642,22 +699,11 @@ bool SDTypeConstraint::ApplyTypeConstraint(TreePatternNode *N, TreePatternNode *VecOperand = getOperandNum(x.SDTCisEltOfVec_Info.OtherOperandNum, N, NodeInfo, VResNo); - if (VecOperand->hasTypeSet(VResNo)) { - if (!isVector(VecOperand->getType(VResNo))) - TP.error(N->getOperator()->getName() + " VT operand must be a vector!"); - EVT IVT = VecOperand->getType(VResNo); - IVT = IVT.getVectorElementType(); - return NodeToApply->UpdateNodeType(ResNo, IVT.getSimpleVT().SimpleTy, TP); - } - if (NodeToApply->hasTypeSet(ResNo) && - VecOperand->getExtType(VResNo).hasVectorTypes()){ - // Filter vector types out of VecOperand that don't have the right element - // type. - return VecOperand->getExtType(VResNo). - EnforceVectorEltTypeIs(NodeToApply->getType(ResNo), TP); - } - return false; + // Filter vector types out of VecOperand that don't have the right element + // type. + return VecOperand->getExtType(VResNo). + EnforceVectorEltTypeIs(NodeToApply->getExtType(ResNo), TP); } } return false; @@ -716,10 +762,11 @@ SDNodeInfo::SDNodeInfo(Record *R) : Def(R) { /// getKnownType - If the type constraints on this node imply a fixed type /// (e.g. all stores return void, etc), then return it as an /// MVT::SimpleValueType. Otherwise, return EEVT::Other. -MVT::SimpleValueType SDNodeInfo::getKnownType() const { +MVT::SimpleValueType SDNodeInfo::getKnownType(unsigned ResNo) const { unsigned NumResults = getNumResults(); assert(NumResults <= 1 && "We only work with nodes with zero or one result so far!"); + assert(ResNo == 0 && "Only handles single result nodes so far"); for (unsigned i = 0, e = TypeConstraints.size(); i != e; ++i) { // Make sure that this applies to the correct node result. 
@@ -750,16 +797,11 @@ TreePatternNode::~TreePatternNode() { static unsigned GetNumNodeResults(Record *Operator, CodeGenDAGPatterns &CDP) { if (Operator->getName() == "set" || - Operator->getName() == "implicit" || - Operator->getName() == "parallel") + Operator->getName() == "implicit") return 0; // All return nothing. - if (Operator->isSubClassOf("Intrinsic")) { - unsigned NumRes = CDP.getIntrinsic(Operator).IS.RetVTs.size(); - if (NumRes == 1 && CDP.getIntrinsic(Operator).IS.RetVTs[0] == MVT::isVoid) - return 0; - return NumRes; - } + if (Operator->isSubClassOf("Intrinsic")) + return CDP.getIntrinsic(Operator).IS.RetVTs.size(); if (Operator->isSubClassOf("SDNode")) return CDP.getSDNodeInfo(Operator).getNumResults(); @@ -782,21 +824,14 @@ static unsigned GetNumNodeResults(Record *Operator, CodeGenDAGPatterns &CDP) { if (Operator->isSubClassOf("Instruction")) { CodeGenInstruction &InstInfo = CDP.getTargetInfo().getInstruction(Operator); + + // FIXME: Should allow access to all the results here. + unsigned NumDefsToAdd = InstInfo.NumDefs ? 1 : 0; - // FIXME: Handle implicit defs right. - if (InstInfo.NumDefs != 0) - return 1; // FIXME: Handle inst results right! - - if (!InstInfo.ImplicitDefs.empty()) { - // Add on one implicit def if it has a resolvable type. - Record *FirstImplicitDef = InstInfo.ImplicitDefs[0]; - assert(FirstImplicitDef->isSubClassOf("Register")); - const std::vector &RegVTs = - CDP.getTargetInfo().getRegisterVTs(FirstImplicitDef); - if (RegVTs.size() == 1) - return 1; - } - return 0; + // Add on one implicit def if it has a resolvable type. 
+ if (InstInfo.HasOneImplicitDefWithKnownVT(CDP.getTargetInfo()) !=MVT::Other) + ++NumDefsToAdd; + return NumDefsToAdd; } if (Operator->isSubClassOf("SDNodeXForm")) @@ -1000,34 +1035,49 @@ TreePatternNode *TreePatternNode::InlinePatternFragments(TreePattern &TP) { /// static EEVT::TypeSet getImplicitType(Record *R, unsigned ResNo, bool NotRegisters, TreePattern &TP) { - assert(ResNo == 0 && "FIXME: Unhandled result number"); - // Check to see if this is a register or a register class. if (R->isSubClassOf("RegisterClass")) { + assert(ResNo == 0 && "Regclass ref only has one result!"); if (NotRegisters) return EEVT::TypeSet(); // Unknown. const CodeGenTarget &T = TP.getDAGPatterns().getTargetInfo(); return EEVT::TypeSet(T.getRegisterClass(R).getValueTypes()); - } else if (R->isSubClassOf("PatFrag")) { + } + + if (R->isSubClassOf("PatFrag")) { + assert(ResNo == 0 && "FIXME: PatFrag with multiple results?"); // Pattern fragment types will be resolved when they are inlined. return EEVT::TypeSet(); // Unknown. - } else if (R->isSubClassOf("Register")) { + } + + if (R->isSubClassOf("Register")) { + assert(ResNo == 0 && "Registers only produce one result!"); if (NotRegisters) return EEVT::TypeSet(); // Unknown. const CodeGenTarget &T = TP.getDAGPatterns().getTargetInfo(); return EEVT::TypeSet(T.getRegisterVTs(R)); - } else if (R->isSubClassOf("ValueType") || R->isSubClassOf("CondCode")) { + } + + if (R->isSubClassOf("ValueType") || R->isSubClassOf("CondCode")) { + assert(ResNo == 0 && "This node only has one result!"); // Using a VTSDNode or CondCodeSDNode. return EEVT::TypeSet(MVT::Other, TP); - } else if (R->isSubClassOf("ComplexPattern")) { + } + + if (R->isSubClassOf("ComplexPattern")) { + assert(ResNo == 0 && "FIXME: ComplexPattern with multiple results?"); if (NotRegisters) return EEVT::TypeSet(); // Unknown. 
return EEVT::TypeSet(TP.getDAGPatterns().getComplexPattern(R).getValueType(), TP); - } else if (R->isSubClassOf("PointerLikeRegClass")) { + } + if (R->isSubClassOf("PointerLikeRegClass")) { + assert(ResNo == 0 && "Regclass can only have one result!"); return EEVT::TypeSet(MVT::iPTR, TP); - } else if (R->getName() == "node" || R->getName() == "srcvalue" || - R->getName() == "zero_reg") { + } + + if (R->getName() == "node" || R->getName() == "srcvalue" || + R->getName() == "zero_reg") { // Placeholder. return EEVT::TypeSet(); // Unknown. } @@ -1175,8 +1225,7 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) { return MadeChange; } - if (getOperator()->getName() == "implicit" || - getOperator()->getName() == "parallel") { + if (getOperator()->getName() == "implicit") { assert(getNumTypes() == 0 && "Node doesn't produce a value"); bool MadeChange = false; @@ -1210,8 +1259,6 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) { // Apply the result type to the node. unsigned NumRetVTs = Int->IS.RetVTs.size(); unsigned NumParamVTs = Int->IS.ParamVTs.size(); - if (NumRetVTs == 1 && Int->IS.RetVTs[0] == MVT::isVoid) - NumRetVTs = 0; for (unsigned i = 0, e = NumRetVTs; i != e; ++i) MadeChange |= UpdateNodeType(i, Int->IS.RetVTs[i], TP); @@ -1237,6 +1284,12 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) { if (getOperator()->isSubClassOf("SDNode")) { const SDNodeInfo &NI = CDP.getSDNodeInfo(getOperator()); + // Check that the number of operands is sane. Negative operands -> varargs. 
+ if (NI.getNumOperands() >= 0 && + getNumChildren() != (unsigned)NI.getNumOperands()) + TP.error(getOperator()->getName() + " node requires exactly " + + itostr(NI.getNumOperands()) + " operands!"); + bool MadeChange = NI.ApplyTypeConstraints(this, TP); for (unsigned i = 0, e = getNumChildren(); i != e; ++i) MadeChange |= getChild(i)->ApplyTypeConstraints(TP, NotRegisters); @@ -1245,21 +1298,20 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) { if (getOperator()->isSubClassOf("Instruction")) { const DAGInstruction &Inst = CDP.getInstruction(getOperator()); - unsigned ResNo = 0; - assert(Inst.getNumResults() <= 1 && - "FIXME: Only supports zero or one result instrs!"); - CodeGenInstruction &InstInfo = CDP.getTargetInfo().getInstruction(getOperator()); - EEVT::TypeSet ResultType; - - // Apply the result type to the node - if (InstInfo.NumDefs != 0) { // # of elements in (outs) list - Record *ResultNode = Inst.getResult(0); + bool MadeChange = false; + + // Apply the result types to the node, these come from the things in the + // (outs) list of the instruction. + // FIXME: Cap at one result so far. + unsigned NumResultsToAdd = InstInfo.NumDefs ? 1 : 0; + for (unsigned ResNo = 0; ResNo != NumResultsToAdd; ++ResNo) { + Record *ResultNode = Inst.getResult(ResNo); if (ResultNode->isSubClassOf("PointerLikeRegClass")) { - ResultType = EEVT::TypeSet(MVT::iPTR, TP); + MadeChange |= UpdateNodeType(ResNo, MVT::iPTR, TP); } else if (ResultNode->getName() == "unknown") { // Nothing to do. 
} else { @@ -1267,25 +1319,23 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) { "Operands should be register classes!"); const CodeGenRegisterClass &RC = CDP.getTargetInfo().getRegisterClass(ResultNode); - ResultType = RC.getValueTypes(); + MadeChange |= UpdateNodeType(ResNo, RC.getValueTypes(), TP); } - } else if (!InstInfo.ImplicitDefs.empty()) { - // If the instruction has implicit defs, the first one defines the result - // type. - Record *FirstImplicitDef = InstInfo.ImplicitDefs[0]; - assert(FirstImplicitDef->isSubClassOf("Register")); - const std::vector &RegVTs = - CDP.getTargetInfo().getRegisterVTs(FirstImplicitDef); - if (RegVTs.size() == 1) // FIXME: Generalize. - ResultType = EEVT::TypeSet(RegVTs); - } else { - // Otherwise, the instruction produces no value result. } - bool MadeChange = false; - - if (!ResultType.isCompletelyUnknown()) - MadeChange |= UpdateNodeType(ResNo, ResultType, TP); + // If the instruction has implicit defs, we apply the first one as a result. + // FIXME: This sucks, it should apply all implicit defs. + if (!InstInfo.ImplicitDefs.empty()) { + unsigned ResNo = NumResultsToAdd; + + // FIXME: Generalize to multiple possible types and multiple possible + // ImplicitDefs. + MVT::SimpleValueType VT = + InstInfo.HasOneImplicitDefWithKnownVT(CDP.getTargetInfo()); + + if (VT != MVT::Other) + MadeChange |= UpdateNodeType(ResNo, VT, TP); + } // If this is an INSERT_SUBREG, constrain the source and destination VTs to // be the same. @@ -1314,17 +1364,17 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) { MVT::SimpleValueType VT; TreePatternNode *Child = getChild(ChildNo++); - assert(Child->getNumTypes() == 1 && "Unknown case?"); + unsigned ChildResNo = 0; // Instructions always use res #0 of their op. 
if (OperandNode->isSubClassOf("RegisterClass")) { const CodeGenRegisterClass &RC = CDP.getTargetInfo().getRegisterClass(OperandNode); - MadeChange |= Child->UpdateNodeType(0, RC.getValueTypes(), TP); + MadeChange |= Child->UpdateNodeType(ChildResNo, RC.getValueTypes(), TP); } else if (OperandNode->isSubClassOf("Operand")) { VT = getValueType(OperandNode->getValueAsDef("Type")); - MadeChange |= Child->UpdateNodeType(0, VT, TP); + MadeChange |= Child->UpdateNodeType(ChildResNo, VT, TP); } else if (OperandNode->isSubClassOf("PointerLikeRegClass")) { - MadeChange |= Child->UpdateNodeType(0, MVT::iPTR, TP); + MadeChange |= Child->UpdateNodeType(ChildResNo, MVT::iPTR, TP); } else if (OperandNode->getName() == "unknown") { // Nothing to do. } else { @@ -1423,13 +1473,13 @@ TreePattern::TreePattern(Record *TheRec, ListInit *RawPat, bool isInput, CodeGenDAGPatterns &cdp) : TheRecord(TheRec), CDP(cdp){ isInputPattern = isInput; for (unsigned i = 0, e = RawPat->getSize(); i != e; ++i) - Trees.push_back(ParseTreePattern((DagInit*)RawPat->getElement(i))); + Trees.push_back(ParseTreePattern(RawPat->getElement(i), "")); } TreePattern::TreePattern(Record *TheRec, DagInit *Pat, bool isInput, CodeGenDAGPatterns &cdp) : TheRecord(TheRec), CDP(cdp){ isInputPattern = isInput; - Trees.push_back(ParseTreePattern(Pat)); + Trees.push_back(ParseTreePattern(Pat, "")); } TreePattern::TreePattern(Record *TheRec, TreePatternNode *Pat, bool isInput, @@ -1457,7 +1507,49 @@ void TreePattern::ComputeNamedNodes(TreePatternNode *N) { } -TreePatternNode *TreePattern::ParseTreePattern(DagInit *Dag) { +TreePatternNode *TreePattern::ParseTreePattern(Init *TheInit, StringRef OpName){ + if (DefInit *DI = dynamic_cast(TheInit)) { + Record *R = DI->getDef(); + + // Direct reference to a leaf DagNode or PatFrag? Turn it into a + // TreePatternNode if its own. 
For example: + /// (foo GPR, imm) -> (foo GPR, (imm)) + if (R->isSubClassOf("SDNode") || R->isSubClassOf("PatFrag")) + return ParseTreePattern(new DagInit(DI, "", + std::vector >()), + OpName); + + // Input argument? + TreePatternNode *Res = new TreePatternNode(DI, 1); + if (R->getName() == "node" && !OpName.empty()) { + if (OpName.empty()) + error("'node' argument requires a name to match with operand list"); + Args.push_back(OpName); + } + + Res->setName(OpName); + return Res; + } + + if (IntInit *II = dynamic_cast(TheInit)) { + if (!OpName.empty()) + error("Constant int argument should not have a name!"); + return new TreePatternNode(II, 1); + } + + if (BitsInit *BI = dynamic_cast(TheInit)) { + // Turn this into an IntInit. + Init *II = BI->convertInitializerTo(new IntRecTy()); + if (II == 0 || !dynamic_cast(II)) + error("Bits value must be constants!"); + return ParseTreePattern(II, OpName); + } + + DagInit *Dag = dynamic_cast(TheInit); + if (!Dag) { + TheInit->dump(); + error("Pattern has unexpected init kind!"); + } DefInit *OpDef = dynamic_cast(Dag->getOperator()); if (!OpDef) error("Pattern has unexpected operator type!"); Record *Operator = OpDef->getDef(); @@ -1468,50 +1560,14 @@ TreePatternNode *TreePattern::ParseTreePattern(DagInit *Dag) { if (Dag->getNumArgs() != 1) error("Type cast only takes one operand!"); - Init *Arg = Dag->getArg(0); - TreePatternNode *New; - if (DefInit *DI = dynamic_cast(Arg)) { - Record *R = DI->getDef(); - if (R->isSubClassOf("SDNode") || R->isSubClassOf("PatFrag")) { - Dag->setArg(0, new DagInit(DI, "", - std::vector >())); - return ParseTreePattern(Dag); - } - - // Input argument? 
- if (R->getName() == "node") { - if (Dag->getArgName(0).empty()) - error("'node' argument requires a name to match with operand list"); - Args.push_back(Dag->getArgName(0)); - } - - New = new TreePatternNode(DI, 1); - } else if (DagInit *DI = dynamic_cast(Arg)) { - New = ParseTreePattern(DI); - } else if (IntInit *II = dynamic_cast(Arg)) { - New = new TreePatternNode(II, 1); - if (!Dag->getArgName(0).empty()) - error("Constant int argument should not have a name!"); - } else if (BitsInit *BI = dynamic_cast(Arg)) { - // Turn this into an IntInit. - Init *II = BI->convertInitializerTo(new IntRecTy()); - if (II == 0 || !dynamic_cast(II)) - error("Bits value must be constants!"); - - New = new TreePatternNode(dynamic_cast(II), 1); - if (!Dag->getArgName(0).empty()) - error("Constant int argument should not have a name!"); - } else { - Arg->dump(); - error("Unknown leaf value for tree pattern!"); - return 0; - } + TreePatternNode *New = ParseTreePattern(Dag->getArg(0), Dag->getArgName(0)); // Apply the type cast. assert(New->getNumTypes() == 1 && "FIXME: Unhandled"); New->UpdateNodeType(0, getValueType(Operator), *this); - if (New->getNumChildren() == 0) - New->setName(Dag->getArgName(0)); + + if (!OpName.empty()) + error("ValueType cast should not have a name!"); return New; } @@ -1522,65 +1578,38 @@ TreePatternNode *TreePattern::ParseTreePattern(DagInit *Dag) { !Operator->isSubClassOf("SDNodeXForm") && !Operator->isSubClassOf("Intrinsic") && Operator->getName() != "set" && - Operator->getName() != "implicit" && - Operator->getName() != "parallel") + Operator->getName() != "implicit") error("Unrecognized node '" + Operator->getName() + "'!"); // Check to see if this is something that is illegal in an input pattern. 
- if (isInputPattern && (Operator->isSubClassOf("Instruction") || - Operator->isSubClassOf("SDNodeXForm"))) - error("Cannot use '" + Operator->getName() + "' in an input pattern!"); + if (isInputPattern) { + if (Operator->isSubClassOf("Instruction") || + Operator->isSubClassOf("SDNodeXForm")) + error("Cannot use '" + Operator->getName() + "' in an input pattern!"); + } else { + if (Operator->isSubClassOf("Intrinsic")) + error("Cannot use '" + Operator->getName() + "' in an output pattern!"); + + if (Operator->isSubClassOf("SDNode") && + Operator->getName() != "imm" && + Operator->getName() != "fpimm" && + Operator->getName() != "tglobaltlsaddr" && + Operator->getName() != "tconstpool" && + Operator->getName() != "tjumptable" && + Operator->getName() != "tframeindex" && + Operator->getName() != "texternalsym" && + Operator->getName() != "tblockaddress" && + Operator->getName() != "tglobaladdr" && + Operator->getName() != "bb" && + Operator->getName() != "vt") + error("Cannot use '" + Operator->getName() + "' in an output pattern!"); + } std::vector Children; - - for (unsigned i = 0, e = Dag->getNumArgs(); i != e; ++i) { - Init *Arg = Dag->getArg(i); - if (DagInit *DI = dynamic_cast(Arg)) { - Children.push_back(ParseTreePattern(DI)); - if (Children.back()->getName().empty()) - Children.back()->setName(Dag->getArgName(i)); - } else if (DefInit *DefI = dynamic_cast(Arg)) { - Record *R = DefI->getDef(); - // Direct reference to a leaf DagNode or PatFrag? Turn it into a - // TreePatternNode if its own. - if (R->isSubClassOf("SDNode") || R->isSubClassOf("PatFrag")) { - Dag->setArg(i, new DagInit(DefI, "", - std::vector >())); - --i; // Revisit this node... - } else { - TreePatternNode *Node = new TreePatternNode(DefI, 1); - Node->setName(Dag->getArgName(i)); - Children.push_back(Node); - - // Input argument? 
- if (R->getName() == "node") { - if (Dag->getArgName(i).empty()) - error("'node' argument requires a name to match with operand list"); - Args.push_back(Dag->getArgName(i)); - } - } - } else if (IntInit *II = dynamic_cast(Arg)) { - TreePatternNode *Node = new TreePatternNode(II, 1); - if (!Dag->getArgName(i).empty()) - error("Constant int argument should not have a name!"); - Children.push_back(Node); - } else if (BitsInit *BI = dynamic_cast(Arg)) { - // Turn this into an IntInit. - Init *II = BI->convertInitializerTo(new IntRecTy()); - if (II == 0 || !dynamic_cast(II)) - error("Bits value must be constants!"); - - TreePatternNode *Node = new TreePatternNode(dynamic_cast(II),1); - if (!Dag->getArgName(i).empty()) - error("Constant int argument should not have a name!"); - Children.push_back(Node); - } else { - errs() << '"'; - Arg->dump(); - errs() << "\": "; - error("Unknown leaf value for tree pattern!"); - } - } + + // Parse all the operands. + for (unsigned i = 0, e = Dag->getNumArgs(); i != e; ++i) + Children.push_back(ParseTreePattern(Dag->getArg(i), Dag->getArgName(i))); // If the operator is an intrinsic, then this is just syntactic sugar for for // (intrinsic_* , ..children..). Pick the right intrinsic node, and @@ -1591,15 +1620,13 @@ TreePatternNode *TreePattern::ParseTreePattern(DagInit *Dag) { // If this intrinsic returns void, it must have side-effects and thus a // chain. - if (Int.IS.RetVTs[0] == MVT::isVoid) { + if (Int.IS.RetVTs.empty()) Operator = getDAGPatterns().get_intrinsic_void_sdnode(); - } else if (Int.ModRef != CodeGenIntrinsic::NoMem) { + else if (Int.ModRef != CodeGenIntrinsic::NoMem) // Has side-effects, requires chain. Operator = getDAGPatterns().get_intrinsic_w_chain_sdnode(); - } else { - // Otherwise, no chain. + else // Otherwise, no chain. 
Operator = getDAGPatterns().get_intrinsic_wo_chain_sdnode(); - } TreePatternNode *IIDNode = new TreePatternNode(new IntInit(IID), 1); Children.insert(Children.begin(), IIDNode); @@ -1607,10 +1634,48 @@ TreePatternNode *TreePattern::ParseTreePattern(DagInit *Dag) { unsigned NumResults = GetNumNodeResults(Operator, CDP); TreePatternNode *Result = new TreePatternNode(Operator, Children, NumResults); - Result->setName(Dag->getName()); + Result->setName(OpName); + + if (!Dag->getName().empty()) { + assert(Result->getName().empty()); + Result->setName(Dag->getName()); + } return Result; } +/// SimplifyTree - See if we can simplify this tree to eliminate something that +/// will never match in favor of something obvious that will. This is here +/// strictly as a convenience to target authors because it allows them to write +/// more type generic things and have useless type casts fold away. +/// +/// This returns true if any change is made. +static bool SimplifyTree(TreePatternNode *&N) { + if (N->isLeaf()) + return false; + + // If we have a bitconvert with a resolved type and if the source and + // destination types are the same, then the bitconvert is useless, remove it. + if (N->getOperator()->getName() == "bitconvert" && + N->getExtType(0).isConcrete() && + N->getExtType(0) == N->getChild(0)->getExtType(0) && + N->getName().empty()) { + N = N->getChild(0); + SimplifyTree(N); + return true; + } + + // Walk all children. + bool MadeChange = false; + for (unsigned i = 0, e = N->getNumChildren(); i != e; ++i) { + TreePatternNode *Child = N->getChild(i); + MadeChange |= SimplifyTree(Child); + N->setChild(i, Child); + } + return MadeChange; +} + + + /// InferAllTypes - Infer/propagate as many types throughout the expression /// patterns as possible. Return true if all types are inferred, false /// otherwise. Throw an exception if a type contradiction is found. 
@@ -1622,8 +1687,10 @@ InferAllTypes(const StringMap > *InNamedTypes) { bool MadeChange = true; while (MadeChange) { MadeChange = false; - for (unsigned i = 0, e = Trees.size(); i != e; ++i) + for (unsigned i = 0, e = Trees.size(); i != e; ++i) { MadeChange |= Trees[i]->ApplyTypeConstraints(*this, false); + MadeChange |= SimplifyTree(Trees[i]); + } // If there are constraints on our named nodes, apply them. for (StringMap >::iterator @@ -2542,37 +2609,7 @@ void CodeGenDAGPatterns::ParsePatterns() { for (unsigned i = 0, e = Patterns.size(); i != e; ++i) { Record *CurPattern = Patterns[i]; DagInit *Tree = CurPattern->getValueAsDag("PatternToMatch"); - DefInit *OpDef = dynamic_cast(Tree->getOperator()); - Record *Operator = OpDef->getDef(); - TreePattern *Pattern; - if (Operator->getName() != "parallel") - Pattern = new TreePattern(CurPattern, Tree, true, *this); - else { - std::vector Values; - RecTy *ListTy = 0; - for (unsigned j = 0, ee = Tree->getNumArgs(); j != ee; ++j) { - Values.push_back(Tree->getArg(j)); - TypedInit *TArg = dynamic_cast(Tree->getArg(j)); - if (TArg == 0) { - errs() << "In dag: " << Tree->getAsString(); - errs() << " -- Untyped argument in pattern\n"; - assert(0 && "Untyped argument in pattern"); - } - if (ListTy != 0) { - ListTy = resolveTypes(ListTy, TArg->getType()); - if (ListTy == 0) { - errs() << "In dag: " << Tree->getAsString(); - errs() << " -- Incompatible types in pattern arguments\n"; - assert(0 && "Incompatible types in pattern arguments"); - } - } - else { - ListTy = TArg->getType(); - } - } - ListInit *LI = new ListInit(Values, new ListRecTy(ListTy)); - Pattern = new TreePattern(CurPattern, LI, true, *this); - } + TreePattern *Pattern = new TreePattern(CurPattern, Tree, true, *this); // Inline pattern fragments into it. 
Pattern->InlinePatternFragments(); diff --git a/utils/TableGen/CodeGenDAGPatterns.h b/utils/TableGen/CodeGenDAGPatterns.h index f583f298f662..548051300dc6 100644 --- a/utils/TableGen/CodeGenDAGPatterns.h +++ b/utils/TableGen/CodeGenDAGPatterns.h @@ -129,7 +129,7 @@ namespace EEVT { /// EnforceVectorEltTypeIs - 'this' is now constrainted to be a vector type /// whose element is VT. - bool EnforceVectorEltTypeIs(MVT::SimpleValueType VT, TreePattern &TP); + bool EnforceVectorEltTypeIs(EEVT::TypeSet &VT, TreePattern &TP); bool operator!=(const TypeSet &RHS) const { return TypeVec != RHS.TypeVec; } bool operator==(const TypeSet &RHS) const { return TypeVec == RHS.TypeVec; } @@ -199,6 +199,9 @@ class SDNodeInfo { SDNodeInfo(Record *R); // Parse the specified record. unsigned getNumResults() const { return NumResults; } + + /// getNumOperands - This is the number of operands required or -1 if + /// variadic. int getNumOperands() const { return NumOperands; } Record *getRecord() const { return Def; } const std::string &getEnumName() const { return EnumName; } @@ -211,7 +214,7 @@ class SDNodeInfo { /// getKnownType - If the type constraints on this node imply a fixed type /// (e.g. all stores return void, etc), then return it as an /// MVT::SimpleValueType. Otherwise, return MVT::Other. - MVT::SimpleValueType getKnownType() const; + MVT::SimpleValueType getKnownType(unsigned ResNo) const; /// hasProperty - Return true if this node has the specified property. 
/// @@ -272,7 +275,7 @@ class TreePatternNode { ~TreePatternNode(); const std::string &getName() const { return Name; } - void setName(const std::string &N) { Name = N; } + void setName(StringRef N) { Name.assign(N.begin(), N.end()); } bool isLeaf() const { return Val != 0; } @@ -510,7 +513,7 @@ class TreePattern { void dump() const; private: - TreePatternNode *ParseTreePattern(DagInit *DI); + TreePatternNode *ParseTreePattern(Init *DI, StringRef OpName); void ComputeNamedNodes(); void ComputeNamedNodes(TreePatternNode *N); }; @@ -595,6 +598,10 @@ class PatternToMatch { unsigned getAddedComplexity() const { return AddedComplexity; } std::string getPredicateCheck() const; + + /// Compute the complexity metric for the input pattern. This roughly + /// corresponds to the number of nodes that are covered. + unsigned getPatternComplexity(const CodeGenDAGPatterns &CGP) const; }; // Deterministic comparison of Record*. diff --git a/utils/TableGen/CodeGenInstruction.cpp b/utils/TableGen/CodeGenInstruction.cpp index eea55618721e..99d196c3ceb4 100644 --- a/utils/TableGen/CodeGenInstruction.cpp +++ b/utils/TableGen/CodeGenInstruction.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "CodeGenInstruction.h" +#include "CodeGenTarget.h" #include "Record.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/STLExtras.h" @@ -294,3 +295,22 @@ CodeGenInstruction::ParseOperandName(const std::string &Op, // Otherwise, didn't find it! throw TheDef->getName() + ": unknown suboperand name in '" + Op + "'"; } + + +/// HasOneImplicitDefWithKnownVT - If the instruction has at least one +/// implicit def and it has a known VT, return the VT, otherwise return +/// MVT::Other. +MVT::SimpleValueType CodeGenInstruction:: +HasOneImplicitDefWithKnownVT(const CodeGenTarget &TargetInfo) const { + if (ImplicitDefs.empty()) return MVT::Other; + + // Check to see if the first implicit def has a resolvable type. 
+ Record *FirstImplicitDef = ImplicitDefs[0]; + assert(FirstImplicitDef->isSubClassOf("Register")); + const std::vector &RegVTs = + TargetInfo.getRegisterVTs(FirstImplicitDef); + if (RegVTs.size() == 1) + return RegVTs[0]; + return MVT::Other; +} + diff --git a/utils/TableGen/CodeGenInstruction.h b/utils/TableGen/CodeGenInstruction.h index c369123dd694..946c2d01a52f 100644 --- a/utils/TableGen/CodeGenInstruction.h +++ b/utils/TableGen/CodeGenInstruction.h @@ -22,6 +22,7 @@ namespace llvm { class Record; class DagInit; + class CodeGenTarget; class CodeGenInstruction { public: @@ -183,6 +184,12 @@ namespace llvm { /// non-empty name. If the instruction does not have an operand with the /// specified name, throw an exception. unsigned getOperandNamed(const std::string &Name) const; + + /// HasOneImplicitDefWithKnownVT - If the instruction has at least one + /// implicit def and it has a known VT, return the VT, otherwise return + /// MVT::Other. + MVT::SimpleValueType + HasOneImplicitDefWithKnownVT(const CodeGenTarget &TargetInfo) const; }; } diff --git a/utils/TableGen/CodeGenTarget.cpp b/utils/TableGen/CodeGenTarget.cpp index 79bc30d8c9ad..0392895ba489 100644 --- a/utils/TableGen/CodeGenTarget.cpp +++ b/utils/TableGen/CodeGenTarget.cpp @@ -18,6 +18,7 @@ #include "CodeGenIntrinsics.h" #include "Record.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/Support/CommandLine.h" #include using namespace llvm; @@ -194,6 +195,10 @@ getRegisterVTs(Record *R) const { } } } + + // Remove duplicates. + array_pod_sort(Result.begin(), Result.end()); + Result.erase(std::unique(Result.begin(), Result.end()), Result.end()); return Result; } @@ -488,15 +493,17 @@ CodeGenIntrinsic::CodeGenIntrinsic(Record *R) { } if (EVT(VT).isOverloaded()) { OverloadedVTs.push_back(VT); - isOverloaded |= true; + isOverloaded = true; } + + // Reject invalid types. 
+ if (VT == MVT::isVoid) + throw "Intrinsic '" + DefName + " has void in result type list!"; + IS.RetVTs.push_back(VT); IS.RetTypeDefs.push_back(TyEl); } - - if (IS.RetVTs.size() == 0) - throw "Intrinsic '"+DefName+"' needs at least a type for the ret value!"; - + // Parse the list of parameter types. TypeList = R->getValueAsListInit("ParamTypes"); for (unsigned i = 0, e = TypeList->getSize(); i != e; ++i) { @@ -517,10 +524,16 @@ CodeGenIntrinsic::CodeGenIntrinsic(Record *R) { "Expected iAny or vAny type"); } else VT = getValueType(TyEl->getValueAsDef("VT")); + if (EVT(VT).isOverloaded()) { OverloadedVTs.push_back(VT); - isOverloaded |= true; + isOverloaded = true; } + + // Reject invalid types. + if (VT == MVT::isVoid && i != e-1 /*void at end means varargs*/) + throw "Intrinsic '" + DefName + " has void in result type list!"; + IS.ParamVTs.push_back(VT); IS.ParamTypeDefs.push_back(TyEl); } diff --git a/utils/TableGen/DAGISelEmitter.cpp b/utils/TableGen/DAGISelEmitter.cpp index 044086add5b2..04c7710ac503 100644 --- a/utils/TableGen/DAGISelEmitter.cpp +++ b/utils/TableGen/DAGISelEmitter.cpp @@ -21,49 +21,6 @@ using namespace llvm; // DAGISelEmitter Helper methods // -/// getPatternSize - Return the 'size' of this pattern. We want to match large -/// patterns before small ones. This is used to determine the size of a -/// pattern. -static unsigned getPatternSize(TreePatternNode *P, CodeGenDAGPatterns &CGP) { - unsigned Size = 3; // The node itself. - // If the root node is a ConstantSDNode, increases its size. - // e.g. (set R32:$dst, 0). - if (P->isLeaf() && dynamic_cast(P->getLeafValue())) - Size += 2; - - // FIXME: This is a hack to statically increase the priority of patterns - // which maps a sub-dag to a complex pattern. e.g. favors LEA over ADD. - // Later we can allow complexity / cost for each pattern to be (optionally) - // specified. 
To get best possible pattern match we'll need to dynamically - // calculate the complexity of all patterns a dag can potentially map to. - const ComplexPattern *AM = P->getComplexPatternInfo(CGP); - if (AM) - Size += AM->getNumOperands() * 3; - - // If this node has some predicate function that must match, it adds to the - // complexity of this node. - if (!P->getPredicateFns().empty()) - ++Size; - - // Count children in the count if they are also nodes. - for (unsigned i = 0, e = P->getNumChildren(); i != e; ++i) { - TreePatternNode *Child = P->getChild(i); - if (!Child->isLeaf() && Child->getNumTypes() && - Child->getType(0) != MVT::Other) - Size += getPatternSize(Child, CGP); - else if (Child->isLeaf()) { - if (dynamic_cast(Child->getLeafValue())) - Size += 5; // Matches a ConstantSDNode (+3) and a specific value (+2). - else if (Child->getComplexPatternInfo(CGP)) - Size += getPatternSize(Child, CGP); - else if (!Child->getPredicateFns().empty()) - ++Size; - } - } - - return Size; -} - /// getResultPatternCost - Compute the number of instructions for this pattern. /// This is a temporary hack. We should really include the instruction /// latencies in this calculation. 
@@ -153,12 +110,25 @@ struct PatternSortingPredicate { PatternSortingPredicate(CodeGenDAGPatterns &cgp) : CGP(cgp) {} CodeGenDAGPatterns &CGP; - bool operator()(const PatternToMatch *LHS, - const PatternToMatch *RHS) { - unsigned LHSSize = getPatternSize(LHS->getSrcPattern(), CGP); - unsigned RHSSize = getPatternSize(RHS->getSrcPattern(), CGP); - LHSSize += LHS->getAddedComplexity(); - RHSSize += RHS->getAddedComplexity(); + bool operator()(const PatternToMatch *LHS, const PatternToMatch *RHS) { + const TreePatternNode *LHSSrc = LHS->getSrcPattern(); + const TreePatternNode *RHSSrc = RHS->getSrcPattern(); + + if (LHSSrc->getNumTypes() != 0 && RHSSrc->getNumTypes() != 0 && + LHSSrc->getType(0) != RHSSrc->getType(0)) { + MVT::SimpleValueType V1 = LHSSrc->getType(0), V2 = RHSSrc->getType(0); + if (MVT(V1).isVector() != MVT(V2).isVector()) + return MVT(V2).isVector(); + + if (MVT(V1).isFloatingPoint() != MVT(V2).isFloatingPoint()) + return MVT(V2).isFloatingPoint(); + } + + // Otherwise, if the patterns might both match, sort based on complexity, + // which means that we prefer to match patterns that cover more nodes in the + // input over nodes that cover fewer. + unsigned LHSSize = LHS->getPatternComplexity(CGP); + unsigned RHSSize = RHS->getPatternComplexity(CGP); if (LHSSize > RHSSize) return true; // LHS -> bigger -> less cost if (LHSSize < RHSSize) return false; @@ -173,7 +143,8 @@ struct PatternSortingPredicate { if (LHSPatSize < RHSPatSize) return true; if (LHSPatSize > RHSPatSize) return false; - // Sort based on the UID of the pattern, giving us a deterministic ordering. + // Sort based on the UID of the pattern, giving us a deterministic ordering + // if all other sorting conditions fail. assert(LHS == RHS || LHS->ID != RHS->ID); return LHS->ID < RHS->ID; } @@ -208,8 +179,7 @@ void DAGISelEmitter::run(raw_ostream &OS) { // We want to process the matches in order of minimal cost. Sort the patterns // so the least cost one is at the start. 
- std::stable_sort(Patterns.begin(), Patterns.end(), - PatternSortingPredicate(CGP)); + std::sort(Patterns.begin(), Patterns.end(), PatternSortingPredicate(CGP)); // Convert each variant of each pattern into a Matcher. diff --git a/utils/TableGen/DAGISelMatcher.cpp b/utils/TableGen/DAGISelMatcher.cpp index cd3fad131eca..9f12a686e4cf 100644 --- a/utils/TableGen/DAGISelMatcher.cpp +++ b/utils/TableGen/DAGISelMatcher.cpp @@ -147,7 +147,8 @@ void SwitchOpcodeMatcher::printImpl(raw_ostream &OS, unsigned indent) const { void CheckTypeMatcher::printImpl(raw_ostream &OS, unsigned indent) const { - OS.indent(indent) << "CheckType " << getEnumName(Type) << '\n'; + OS.indent(indent) << "CheckType " << getEnumName(Type) << ", ResNo=" + << ResNo << '\n'; } void SwitchTypeMatcher::printImpl(raw_ostream &OS, unsigned indent) const { @@ -356,12 +357,11 @@ bool CheckOpcodeMatcher::isContradictoryImpl(const Matcher *M) const { // different, then we know they contradict. For example, a check for // ISD::STORE will never be true at the same time a check for Type i32 is. if (const CheckTypeMatcher *CT = dyn_cast(M)) { - // FIXME: What result is this referring to? - MVT::SimpleValueType NodeType; - if (getOpcode().getNumResults() == 0) - NodeType = MVT::isVoid; - else - NodeType = getOpcode().getKnownType(); + // If checking for a result the opcode doesn't have, it can't match. + if (CT->getResNo() >= getOpcode().getNumResults()) + return true; + + MVT::SimpleValueType NodeType = getOpcode().getKnownType(CT->getResNo()); if (NodeType != MVT::Other) return TypesAreContradictory(NodeType, CT->getType()); } diff --git a/utils/TableGen/DAGISelMatcher.h b/utils/TableGen/DAGISelMatcher.h index ef7ecf400505..d9b25d556430 100644 --- a/utils/TableGen/DAGISelMatcher.h +++ b/utils/TableGen/DAGISelMatcher.h @@ -492,14 +492,16 @@ class SwitchOpcodeMatcher : public Matcher { }; /// CheckTypeMatcher - This checks to see if the current node has the -/// specified type, if not it fails to match. 
+/// specified type at the specified result, if not it fails to match. class CheckTypeMatcher : public Matcher { MVT::SimpleValueType Type; + unsigned ResNo; public: - CheckTypeMatcher(MVT::SimpleValueType type) - : Matcher(CheckType), Type(type) {} + CheckTypeMatcher(MVT::SimpleValueType type, unsigned resno) + : Matcher(CheckType), Type(type), ResNo(resno) {} MVT::SimpleValueType getType() const { return Type; } + unsigned getResNo() const { return ResNo; } static inline bool classof(const Matcher *N) { return N->getKind() == CheckType; diff --git a/utils/TableGen/DAGISelMatcherEmitter.cpp b/utils/TableGen/DAGISelMatcherEmitter.cpp index cabf2d438254..4473f0de9667 100644 --- a/utils/TableGen/DAGISelMatcherEmitter.cpp +++ b/utils/TableGen/DAGISelMatcherEmitter.cpp @@ -32,6 +32,7 @@ OmitComments("omit-comments", cl::desc("Do not generate comments"), namespace { class MatcherTableEmitter { + const CodeGenDAGPatterns &CGP; StringMap NodePredicateMap, PatternPredicateMap; std::vector NodePredicates, PatternPredicates; @@ -43,13 +44,12 @@ class MatcherTableEmitter { std::vector NodeXForms; public: - MatcherTableEmitter() {} + MatcherTableEmitter(const CodeGenDAGPatterns &cgp) : CGP(cgp) {} unsigned EmitMatcherList(const Matcher *N, unsigned Indent, unsigned StartIdx, formatted_raw_ostream &OS); - void EmitPredicateFunctions(const CodeGenDAGPatterns &CGP, - formatted_raw_ostream &OS); + void EmitPredicateFunctions(formatted_raw_ostream &OS); void EmitHistogram(const Matcher *N, formatted_raw_ostream &OS); private: @@ -255,9 +255,9 @@ EmitMatcher(const Matcher *N, unsigned Indent, unsigned CurrentIdx, } case Matcher::CheckOpcode: - OS << "OPC_CheckOpcode, " - << cast(N)->getOpcode().getEnumName() << ",\n"; - return 2; + OS << "OPC_CheckOpcode, TARGET_OPCODE(" + << cast(N)->getOpcode().getEnumName() << "),\n"; + return 3; case Matcher::SwitchOpcode: case Matcher::SwitchType: { @@ -280,10 +280,14 @@ EmitMatcher(const Matcher *N, unsigned Indent, unsigned CurrentIdx, // For 
each case we emit the size, then the opcode, then the matcher. for (unsigned i = 0, e = NumCases; i != e; ++i) { const Matcher *Child; - if (const SwitchOpcodeMatcher *SOM = dyn_cast(N)) + unsigned IdxSize; + if (const SwitchOpcodeMatcher *SOM = dyn_cast(N)) { Child = SOM->getCaseMatcher(i); - else + IdxSize = 2; // size of opcode in table is 2 bytes. + } else { Child = cast(N)->getCaseMatcher(i); + IdxSize = 1; // size of type in table is 1 byte. + } // We need to encode the opcode and the offset of the case code before // emitting the case code. Handle this by buffering the output into a @@ -299,7 +303,8 @@ EmitMatcher(const Matcher *N, unsigned Indent, unsigned CurrentIdx, TmpBuf.clear(); raw_svector_ostream OS(TmpBuf); formatted_raw_ostream FOS(OS); - ChildSize = EmitMatcherList(Child, Indent+1, CurrentIdx+VBRSize+1, FOS); + ChildSize = EmitMatcherList(Child, Indent+1, CurrentIdx+VBRSize+IdxSize, + FOS); } while (GetVBRSize(ChildSize) != VBRSize); assert(ChildSize != 0 && "Should not have a zero-sized child!"); @@ -316,15 +321,15 @@ EmitMatcher(const Matcher *N, unsigned Indent, unsigned CurrentIdx, OS << ' '; if (const SwitchOpcodeMatcher *SOM = dyn_cast(N)) - OS << SOM->getCaseOpcode(i).getEnumName(); + OS << "TARGET_OPCODE(" << SOM->getCaseOpcode(i).getEnumName() << "),"; else - OS << getEnumName(cast(N)->getCaseType(i)); - OS << ','; - + OS << getEnumName(cast(N)->getCaseType(i)) << ','; + + CurrentIdx += IdxSize; + if (!OmitComments) - OS << "// ->" << CurrentIdx+ChildSize+1; + OS << "// ->" << CurrentIdx+ChildSize; OS << '\n'; - ++CurrentIdx; OS << TmpBuf.str(); CurrentIdx += ChildSize; } @@ -341,6 +346,8 @@ EmitMatcher(const Matcher *N, unsigned Indent, unsigned CurrentIdx, } case Matcher::CheckType: + assert(cast(N)->getResNo() == 0 && + "FIXME: Add support for CheckType of resno != 0"); OS << "OPC_CheckType, " << getEnumName(cast(N)->getType()) << ",\n"; return 2; @@ -442,6 +449,13 @@ EmitMatcher(const Matcher *N, unsigned Indent, unsigned CurrentIdx, 
case Matcher::EmitMergeInputChains: { const EmitMergeInputChainsMatcher *MN = cast(N); + + // Handle the specialized forms OPC_EmitMergeInputChains1_0 and 1_1. + if (MN->getNumNodes() == 1 && MN->getNode(0) < 2) { + OS << "OPC_EmitMergeInputChains1_" << MN->getNode(0) << ",\n"; + return 1; + } + OS << "OPC_EmitMergeInputChains, " << MN->getNumNodes() << ", "; for (unsigned i = 0, e = MN->getNumNodes(); i != e; ++i) OS << MN->getNode(i) << ", "; @@ -507,7 +521,8 @@ EmitMatcher(const Matcher *N, unsigned Indent, unsigned CurrentIdx, if (const MorphNodeToMatcher *SNT = dyn_cast(N)) { OS.PadToColumn(Indent*2) << "// Src: " - << *SNT->getPattern().getSrcPattern() << '\n'; + << *SNT->getPattern().getSrcPattern() << " - Complexity = " + << SNT->getPattern().getPatternComplexity(CGP) << '\n'; OS.PadToColumn(Indent*2) << "// Dst: " << *SNT->getPattern().getDstPattern() << '\n'; } @@ -534,7 +549,8 @@ EmitMatcher(const Matcher *N, unsigned Indent, unsigned CurrentIdx, OS << '\n'; if (!OmitComments) { OS.PadToColumn(Indent*2) << "// Src: " - << *CM->getPattern().getSrcPattern() << '\n'; + << *CM->getPattern().getSrcPattern() << " - Complexity = " + << CM->getPattern().getPatternComplexity(CGP) << '\n'; OS.PadToColumn(Indent*2) << "// Dst: " << *CM->getPattern().getDstPattern(); } @@ -565,8 +581,7 @@ EmitMatcherList(const Matcher *N, unsigned Indent, unsigned CurrentIdx, return Size; } -void MatcherTableEmitter::EmitPredicateFunctions(const CodeGenDAGPatterns &CGP, - formatted_raw_ostream &OS) { +void MatcherTableEmitter::EmitPredicateFunctions(formatted_raw_ostream &OS) { // Emit pattern predicates. 
if (!PatternPredicates.empty()) { OS << "bool CheckPatternPredicate(unsigned PredNo) const {\n"; @@ -760,7 +775,7 @@ void llvm::EmitMatcherTable(const Matcher *TheMatcher, OS << "// The main instruction selector code.\n"; OS << "SDNode *SelectCode(SDNode *N) {\n"; - MatcherTableEmitter MatcherEmitter; + MatcherTableEmitter MatcherEmitter(CGP); OS << " // Opcodes are emitted as 2 bytes, TARGET_OPCODE handles this.\n"; OS << " #define TARGET_OPCODE(X) X & 255, unsigned(X) >> 8\n"; @@ -775,5 +790,5 @@ void llvm::EmitMatcherTable(const Matcher *TheMatcher, OS << '\n'; // Next up, emit the function for node and pattern predicates: - MatcherEmitter.EmitPredicateFunctions(CGP, OS); + MatcherEmitter.EmitPredicateFunctions(OS); } diff --git a/utils/TableGen/DAGISelMatcherGen.cpp b/utils/TableGen/DAGISelMatcherGen.cpp index da6f6afd82dd..9d469a9acbee 100644 --- a/utils/TableGen/DAGISelMatcherGen.cpp +++ b/utils/TableGen/DAGISelMatcherGen.cpp @@ -408,13 +408,13 @@ void MatcherGen::EmitMatchCode(const TreePatternNode *N, // If N and NodeNoTypes don't agree on a type, then this is a case where we // need to do a type check. Emit the check, apply the tyep to NodeNoTypes and // reinfer any correlated types. - bool DoTypeCheck = false; - if (NodeNoTypes->getNumTypes() != 0 && - NodeNoTypes->getExtType(0) != N->getExtType(0)) { - assert(NodeNoTypes->getNumTypes() == 1 && "FIXME: Handle multiple results"); - NodeNoTypes->setType(0, N->getExtType(0)); + SmallVector ResultsToTypeCheck; + + for (unsigned i = 0, e = NodeNoTypes->getNumTypes(); i != e; ++i) { + if (NodeNoTypes->getExtType(i) == N->getExtType(i)) continue; + NodeNoTypes->setType(i, N->getExtType(i)); InferPossibleTypes(); - DoTypeCheck = true; + ResultsToTypeCheck.push_back(i); } // If this node has a name associated with it, capture it in VariableMap. 
If @@ -444,10 +444,9 @@ void MatcherGen::EmitMatchCode(const TreePatternNode *N, for (unsigned i = 0, e = N->getPredicateFns().size(); i != e; ++i) AddMatcher(new CheckPredicateMatcher(N->getPredicateFns()[i])); - if (DoTypeCheck) { - assert(N->getNumTypes() == 1); - AddMatcher(new CheckTypeMatcher(N->getType(0))); - } + for (unsigned i = 0, e = ResultsToTypeCheck.size(); i != e; ++i) + AddMatcher(new CheckTypeMatcher(N->getType(ResultsToTypeCheck[i]), + ResultsToTypeCheck[i])); } /// EmitMatcherCode - Generate the code that matches the predicate of this @@ -688,9 +687,19 @@ EmitResultInstructionAsOperand(const TreePatternNode *N, continue; } + const TreePatternNode *Child = N->getChild(ChildNo); + // Otherwise this is a normal operand or a predicate operand without // 'execute always'; emit it. - EmitResultOperand(N->getChild(ChildNo), InstOps); + unsigned BeforeAddingNumOps = InstOps.size(); + EmitResultOperand(Child, InstOps); + assert(InstOps.size() > BeforeAddingNumOps && "Didn't add any operands"); + + // If the operand is an instruction and it produced multiple results, just + // take the first one. + if (!Child->isLeaf() && Child->getOperator()->isSubClassOf("Instruction")) + InstOps.resize(BeforeAddingNumOps+1); + ++ChildNo; } @@ -712,24 +721,19 @@ EmitResultInstructionAsOperand(const TreePatternNode *N, // Determine the result types. SmallVector ResultVTs; - if (N->getNumTypes()) { - // FIXME2: If the node has multiple results, we should add them. For now, - // preserve existing behavior?! - assert(N->getNumTypes() == 1); - ResultVTs.push_back(N->getType(0)); - } + for (unsigned i = 0, e = N->getNumTypes(); i != e; ++i) + ResultVTs.push_back(N->getType(i)); // If this is the root instruction of a pattern that has physical registers in // its result pattern, add output VTs for them. 
For example, X86 has: // (set AL, (mul ...)) // This also handles implicit results like: // (implicit EFLAGS) - if (isRoot && Pattern.getDstRegs().size() != 0) { + if (isRoot && !Pattern.getDstRegs().empty()) { // If the root came from an implicit def in the instruction handling stuff, // don't re-add it. Record *HandledReg = 0; - if (NumResults == 0 && N->getNumTypes() != 0 && - !II.ImplicitDefs.empty()) + if (II.HasOneImplicitDefWithKnownVT(CGT) != MVT::Other) HandledReg = II.ImplicitDefs[0]; for (unsigned i = 0; i != Pattern.getDstRegs().size(); ++i) { @@ -763,6 +767,9 @@ EmitResultInstructionAsOperand(const TreePatternNode *N, bool NodeHasMemRefs = isRoot && Pattern.getSrcPattern()->TreeHasProperty(SDNPMemOperand, CGP); + assert((!ResultVTs.empty() || TreeHasOutFlag || NodeHasChain) && + "Node has no result"); + AddMatcher(new EmitNodeMatcher(II.Namespace+"::"+II.TheDef->getName(), ResultVTs.data(), ResultVTs.size(), InstOps.data(), InstOps.size(), @@ -830,33 +837,35 @@ void MatcherGen::EmitResultCode() { // At this point, we have however many values the result pattern produces. // However, the input pattern might not need all of these. If there are - // excess values at the end (such as condition codes etc) just lop them off. - // This doesn't need to worry about flags or chains, just explicit results. + // excess values at the end (such as implicit defs of condition codes etc) + // just lop them off. This doesn't need to worry about flags or chains, just + // explicit results. // - // FIXME2: This doesn't work because there is currently no way to get an - // accurate count of the # results the source pattern sets. 
This is because - // of the "parallel" construct in X86 land, which looks like this: - // - //def : Pat<(parallel (X86and_flag GR8:$src1, GR8:$src2), - // (implicit EFLAGS)), - // (AND8rr GR8:$src1, GR8:$src2)>; - // - // This idiom means to match the two-result node X86and_flag (which is - // declared as returning a single result, because we can't match multi-result - // nodes yet). In this case, we would have to know that the input has two - // results. However, mul8r is modelled exactly the same way, but without - // implicit defs included. The fix is to support multiple results directly - // and eliminate 'parallel'. - // - // FIXME2: When this is fixed, we should revert the terrible hack in the - // OPC_EmitNode code in the interpreter. -#if 0 - const TreePatternNode *Src = Pattern.getSrcPattern(); - unsigned NumSrcResults = Src->getTypeNum(0) != MVT::isVoid ? 1 : 0; - NumSrcResults += Pattern.getDstRegs().size(); + unsigned NumSrcResults = Pattern.getSrcPattern()->getNumTypes(); + + // If the pattern also has (implicit) results, count them as well. + if (!Pattern.getDstRegs().empty()) { + // If the root came from an implicit def in the instruction handling stuff, + // don't re-add it. 
+ Record *HandledReg = 0; + const TreePatternNode *DstPat = Pattern.getDstPattern(); + if (!DstPat->isLeaf() &&DstPat->getOperator()->isSubClassOf("Instruction")){ + const CodeGenTarget &CGT = CGP.getTargetInfo(); + CodeGenInstruction &II = CGT.getInstruction(DstPat->getOperator()); + + if (II.HasOneImplicitDefWithKnownVT(CGT) != MVT::Other) + HandledReg = II.ImplicitDefs[0]; + } + + for (unsigned i = 0; i != Pattern.getDstRegs().size(); ++i) { + Record *Reg = Pattern.getDstRegs()[i]; + if (!Reg->isSubClassOf("Register") || Reg == HandledReg) continue; + ++NumSrcResults; + } + } + assert(Ops.size() >= NumSrcResults && "Didn't provide enough results"); Ops.resize(NumSrcResults); -#endif // If the matched pattern covers nodes which define a flag result, emit a node // that tells the matcher about them so that it can update their results. diff --git a/utils/TableGen/DAGISelMatcherOpt.cpp b/utils/TableGen/DAGISelMatcherOpt.cpp index 820ab63c1fa7..c73bdb9efb68 100644 --- a/utils/TableGen/DAGISelMatcherOpt.cpp +++ b/utils/TableGen/DAGISelMatcherOpt.cpp @@ -48,8 +48,9 @@ static void ContractNodes(OwningPtr &MatcherPtr, New = new RecordChildMatcher(MC->getChildNo(), RM->getWhatFor(), RM->getResultNo()); - if (CheckTypeMatcher *CT= dyn_cast(MC->getNext())) - if (MC->getChildNo() < 8) // Only have CheckChildType0...7 + if (CheckTypeMatcher *CT = dyn_cast(MC->getNext())) + if (MC->getChildNo() < 8 && // Only have CheckChildType0...7 + CT->getResNo() == 0) // CheckChildType checks res #0 New = new CheckChildTypeMatcher(MC->getChildNo(), CT->getType()); if (New) { @@ -420,10 +421,12 @@ static void FactorNodes(OwningPtr &MatcherPtr) { CheckTypeMatcher *CTM = cast_or_null(FindNodeWithKind(NewOptionsToMatch[i], Matcher::CheckType)); - if (CTM == 0 || + if (CTM == 0 || // iPTR checks could alias any other case without us knowing, don't // bother with them. CTM->getType() == MVT::iPTR || + // SwitchType only works for result #0. 
+ CTM->getResNo() != 0 || // If the CheckType isn't at the start of the list, see if we can move // it there. !CTM->canMoveBefore(NewOptionsToMatch[i])) { @@ -488,7 +491,7 @@ static void FactorNodes(OwningPtr &MatcherPtr) { MatcherPtr.reset(new SwitchTypeMatcher(&Cases[0], Cases.size())); } else { // If we factored and ended up with one case, create it now. - MatcherPtr.reset(new CheckTypeMatcher(Cases[0].first)); + MatcherPtr.reset(new CheckTypeMatcher(Cases[0].first, 0)); MatcherPtr->setNext(Cases[0].second); } return; diff --git a/utils/TableGen/FastISelEmitter.cpp b/utils/TableGen/FastISelEmitter.cpp index 23f09a5bd316..ba59e50fab28 100644 --- a/utils/TableGen/FastISelEmitter.cpp +++ b/utils/TableGen/FastISelEmitter.cpp @@ -296,9 +296,11 @@ void FastISelMap::CollectPatterns(CodeGenDAGPatterns &CGP) { if (!InstPatNode) continue; if (InstPatNode->isLeaf()) continue; + // Ignore multiple result nodes for now. + if (InstPatNode->getNumTypes() > 1) continue; + Record *InstPatOp = InstPatNode->getOperator(); std::string OpcodeName = getOpcodeName(InstPatOp, CGP); - assert(InstPatNode->getNumTypes() <= 1); MVT::SimpleValueType RetVT = MVT::isVoid; if (InstPatNode->getNumTypes()) RetVT = InstPatNode->getType(0); MVT::SimpleValueType VT = RetVT; diff --git a/utils/TableGen/InstrInfoEmitter.cpp b/utils/TableGen/InstrInfoEmitter.cpp index 8f7550b84810..9bc545928d9b 100644 --- a/utils/TableGen/InstrInfoEmitter.cpp +++ b/utils/TableGen/InstrInfoEmitter.cpp @@ -337,7 +337,7 @@ void InstrInfoEmitter::emitShiftedValue(Record *R, StringInit *Val, IntInit *ShiftInt, raw_ostream &OS) { if (Val == 0 || ShiftInt == 0) throw std::string("Illegal value or shift amount in TargetInfo*!"); - RecordVal *RV = R->getValue(Val->getValue()); + RecordVal *RV = R->getDottedValue(Val->getValue()); int Shift = ShiftInt->getValue(); if (RV == 0 || RV->getValue() == 0) { diff --git a/utils/TableGen/IntrinsicEmitter.cpp b/utils/TableGen/IntrinsicEmitter.cpp index c5df9e411c68..417aca7cb37b 100644 
--- a/utils/TableGen/IntrinsicEmitter.cpp +++ b/utils/TableGen/IntrinsicEmitter.cpp @@ -172,10 +172,11 @@ static void EmitTypeGenerate(raw_ostream &OS, const Record *ArgType, static void EmitTypeGenerate(raw_ostream &OS, const std::vector &ArgTypes, unsigned &ArgNo) { - if (ArgTypes.size() == 1) { - EmitTypeGenerate(OS, ArgTypes.front(), ArgNo); - return; - } + if (ArgTypes.empty()) + return EmitTypeForValueType(OS, MVT::isVoid); + + if (ArgTypes.size() == 1) + return EmitTypeGenerate(OS, ArgTypes.front(), ArgNo); OS << "StructType::get(Context, "; @@ -251,11 +252,11 @@ namespace { unsigned RHSSize = RHSVec->size(); unsigned LHSSize = LHSVec->size(); - do { + for (; i != LHSSize; ++i) { if (i == RHSSize) return false; // RHS is shorter than LHS. if ((*LHSVec)[i] != (*RHSVec)[i]) return (*LHSVec)[i]->getName() < (*RHSVec)[i]->getName(); - } while (++i != LHSSize); + } if (i != RHSSize) return true; diff --git a/utils/TableGen/Record.cpp b/utils/TableGen/Record.cpp index b9facb4c2a6e..55c998926c0a 100644 --- a/utils/TableGen/Record.cpp +++ b/utils/TableGen/Record.cpp @@ -1108,12 +1108,15 @@ RecTy *VarInit::getFieldType(const std::string &FieldName) const { return 0; } -Init *VarInit::getFieldInit(Record &R, const std::string &FieldName) const { +Init *VarInit::getFieldInit(Record &R, const RecordVal *RV, + const std::string &FieldName) const { if (dynamic_cast(getType())) - if (const RecordVal *RV = R.getValue(VarName)) { - Init *TheInit = RV->getValue(); + if (const RecordVal *Val = R.getValue(VarName)) { + if (RV != Val && (RV || dynamic_cast(Val->getValue()))) + return 0; + Init *TheInit = Val->getValue(); assert(TheInit != this && "Infinite loop detected!"); - if (Init *I = TheInit->getFieldInit(R, FieldName)) + if (Init *I = TheInit->getFieldInit(R, RV, FieldName)) return I; else return 0; @@ -1174,7 +1177,8 @@ RecTy *DefInit::getFieldType(const std::string &FieldName) const { return 0; } -Init *DefInit::getFieldInit(Record &R, const std::string &FieldName) 
const { +Init *DefInit::getFieldInit(Record &R, const RecordVal *RV, + const std::string &FieldName) const { return Def->getValue(FieldName)->getValue(); } @@ -1185,7 +1189,7 @@ std::string DefInit::getAsString() const { Init *FieldInit::resolveBitReference(Record &R, const RecordVal *RV, unsigned Bit) { - if (Init *BitsVal = Rec->getFieldInit(R, FieldName)) + if (Init *BitsVal = Rec->getFieldInit(R, RV, FieldName)) if (BitsInit *BI = dynamic_cast(BitsVal)) { assert(Bit < BI->getNumBits() && "Bit reference out of range!"); Init *B = BI->getBit(Bit); @@ -1198,7 +1202,7 @@ Init *FieldInit::resolveBitReference(Record &R, const RecordVal *RV, Init *FieldInit::resolveListElementReference(Record &R, const RecordVal *RV, unsigned Elt) { - if (Init *ListVal = Rec->getFieldInit(R, FieldName)) + if (Init *ListVal = Rec->getFieldInit(R, RV, FieldName)) if (ListInit *LI = dynamic_cast(ListVal)) { if (Elt >= LI->getSize()) return 0; Init *E = LI->getElement(Elt); @@ -1215,7 +1219,7 @@ Init *FieldInit::resolveListElementReference(Record &R, const RecordVal *RV, Init *FieldInit::resolveReferences(Record &R, const RecordVal *RV) { Init *NewRec = RV ? Rec->resolveReferences(R, RV) : Rec; - Init *BitsVal = NewRec->getFieldInit(R, FieldName); + Init *BitsVal = NewRec->getFieldInit(R, RV, FieldName); if (BitsVal) { Init *BVR = BitsVal->resolveReferences(R, RV); return BVR->isComplete() ? 
BVR : this; @@ -1303,6 +1307,16 @@ void Record::resolveReferencesTo(const RecordVal *RV) { } } +RecordVal *Record::getDottedValue(StringRef Name) { + size_t pos = Name.find('.'); + if (pos == StringRef::npos) + return getValue(Name); + RecordVal *RV = getValue(Name.substr(0, pos)); + if (!RV) return 0; + DefInit *DI = dynamic_cast(RV->getValue()); + if (!DI) return 0; + return DI->getDef()->getDottedValue(Name.substr(pos+1)); +} void Record::dump() const { errs() << *this; } diff --git a/utils/TableGen/Record.h b/utils/TableGen/Record.h index 90096e98e206..41373c799a0d 100644 --- a/utils/TableGen/Record.h +++ b/utils/TableGen/Record.h @@ -503,7 +503,8 @@ struct Init { /// initializer for the specified field. If getFieldType returns non-null /// this method should return non-null, otherwise it returns null. /// - virtual Init *getFieldInit(Record &R, const std::string &FieldName) const { + virtual Init *getFieldInit(Record &R, const RecordVal *RV, + const std::string &FieldName) const { return 0; } @@ -950,7 +951,8 @@ class VarInit : public TypedInit { unsigned Elt); virtual RecTy *getFieldType(const std::string &FieldName) const; - virtual Init *getFieldInit(Record &R, const std::string &FieldName) const; + virtual Init *getFieldInit(Record &R, const RecordVal *RV, + const std::string &FieldName) const; /// resolveReferences - This method is used by classes that refer to other /// variables which may not be defined at the time they expression is formed. 
@@ -1035,7 +1037,8 @@ class DefInit : public TypedInit { //virtual Init *convertInitializerBitRange(const std::vector &Bits); virtual RecTy *getFieldType(const std::string &FieldName) const; - virtual Init *getFieldInit(Record &R, const std::string &FieldName) const; + virtual Init *getFieldInit(Record &R, const RecordVal *RV, + const std::string &FieldName) const; virtual std::string getAsString() const; @@ -1259,6 +1262,9 @@ class Record { return 0; } + // Like getValue, but allow dotting into members: X.Y + RecordVal *getDottedValue(StringRef Name); + void addTemplateArg(StringRef Name) { assert(!isTemplateArg(Name) && "Template arg already defined!"); TemplateArgs.push_back(Name); diff --git a/utils/TableGen/TableGen.cpp b/utils/TableGen/TableGen.cpp index f20ec00aa0e5..1326ebc023f1 100644 --- a/utils/TableGen/TableGen.cpp +++ b/utils/TableGen/TableGen.cpp @@ -34,7 +34,6 @@ #include "SubtargetEmitter.h" #include "TGParser.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/FileUtilities.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/PrettyStackTrace.h" #include "llvm/Support/raw_ostream.h" diff --git a/utils/buildit/build_llvm b/utils/buildit/build_llvm index f67794e4fa96..33f471db4fc3 100755 --- a/utils/buildit/build_llvm +++ b/utils/buildit/build_llvm @@ -203,6 +203,7 @@ fi make $JOBS_FLAG $OPTIMIZE_OPTS UNIVERSAL=1 UNIVERSAL_ARCH="$TARGETS" \ UNIVERSAL_SDK_PATH=$HOST_SDKROOT \ NO_RUNTIME_LIBS=1 \ + DISABLE_EDIS=1 \ LLVM_SUBMIT_VERSION=$LLVM_SUBMIT_VERSION \ LLVM_SUBMIT_SUBVERSION=$LLVM_SUBMIT_SUBVERSION \ CXXFLAGS="-DLLVM_VERSION_INFO='\" Apple Build #$LLVM_VERSION\"'" \ @@ -227,6 +228,7 @@ cd $DIR/obj-llvm || exit 1 # Install the tree into the destination directory. 
make $LOCAL_MAKEFLAGS $OPTIMIZE_OPTS UNIVERSAL=1 UNIVERSAL_ARCH="$TARGETS" \ NO_RUNTIME_LIBS=1 \ + DISABLE_EDIS=1 \ LLVM_SUBMIT_VERSION=$LLVM_SUBMIT_VERSION \ LLVM_SUBMIT_SUBVERSION=$LLVM_SUBMIT_SUBVERSION \ OPTIMIZE_OPTION='-O3' VERBOSE=1 install diff --git a/utils/lit/lit/LitTestCase.py b/utils/lit/lit/LitTestCase.py new file mode 100644 index 000000000000..895118584357 --- /dev/null +++ b/utils/lit/lit/LitTestCase.py @@ -0,0 +1,30 @@ +import unittest +import Test + +""" +TestCase adaptor for providing a 'unittest' compatible interface to 'lit' tests. +""" + +class UnresolvedError(RuntimeError): + pass + +class LitTestCase(unittest.TestCase): + def __init__(self, test, lit_config): + unittest.TestCase.__init__(self) + self._test = test + self._lit_config = lit_config + + def id(self): + return self._test.getFullName() + + def shortDescription(self): + return self._test.getFullName() + + def runTest(self): + tr, output = self._test.config.test_format.execute( + self._test, self._lit_config) + + if tr is Test.UNRESOLVED: + raise UnresolvedError(output) + elif tr.isFailure: + self.fail(output) diff --git a/utils/lit/lit/TestFormats.py b/utils/lit/lit/TestFormats.py index 433e39a62780..7ab9bb6e452e 100644 --- a/utils/lit/lit/TestFormats.py +++ b/utils/lit/lit/TestFormats.py @@ -90,8 +90,9 @@ def getTestsInDirectory(self, testSuite, path_in_suite, litConfig, localConfig): source_path = testSuite.getSourcePath(path_in_suite) for filename in os.listdir(source_path): - # Ignore dot files. - if filename.startswith('.'): + # Ignore dot files and excluded tests. 
+ if (filename.startswith('.') or + filename in localConfig.excludes): continue filepath = os.path.join(source_path, filename) diff --git a/utils/lit/lit/TestRunner.py b/utils/lit/lit/TestRunner.py index 29adff222983..d10e4b030d8a 100644 --- a/utils/lit/lit/TestRunner.py +++ b/utils/lit/lit/TestRunner.py @@ -63,6 +63,7 @@ def executeShCmd(cmd, cfg, cwd, results): procs = [] input = subprocess.PIPE stderrTempFiles = [] + opened_files = [] # To avoid deadlock, we use a single stderr stream for piped # output. This is null until we have seen some output using # stderr. @@ -113,8 +114,11 @@ def executeShCmd(cmd, cfg, cwd, results): else: r[2] = open(r[0], r[1]) # Workaround a Win32 and/or subprocess bug when appending. + # + # FIXME: Actually, this is probably an instance of PR6753. if r[1] == 'a': r[2].seek(0, 2) + opened_files.append(r[2]) result = r[2] final_redirects.append(result) @@ -176,7 +180,7 @@ def executeShCmd(cmd, cfg, cwd, results): else: err = '' procData[i] = (out,err) - + # Read stderr out of the temp files. for i,f in stderrTempFiles: f.seek(0, 0) @@ -199,6 +203,10 @@ def executeShCmd(cmd, cfg, cwd, results): else: exitCode = res + # Explicitly close any redirected files. + for f in opened_files: + f.close() + if cmd.negate: exitCode = not exitCode diff --git a/utils/lit/lit/lit.py b/utils/lit/lit/lit.py index e80075478a6d..a29fa42101cc 100755 --- a/utils/lit/lit/lit.py +++ b/utils/lit/lit/lit.py @@ -315,6 +315,48 @@ def runTests(numThreads, litConfig, provider, display): except KeyboardInterrupt: sys.exit(2) +def load_test_suite(inputs): + import unittest + + # Create the global config object. + litConfig = LitConfig.LitConfig(progname = 'lit', + path = [], + quiet = False, + useValgrind = False, + valgrindLeakCheck = False, + valgrindArgs = [], + useTclAsSh = False, + noExecute = False, + debug = False, + isWindows = (platform.system()=='Windows'), + params = {}) + + # Load the tests from the inputs. 
+ tests = [] + testSuiteCache = {} + localConfigCache = {} + for input in inputs: + prev = len(tests) + tests.extend(getTests(input, litConfig, + testSuiteCache, localConfigCache)[1]) + if prev == len(tests): + litConfig.warning('input %r contained no tests' % input) + + # If there were any errors during test discovery, exit now. + if litConfig.numErrors: + print >>sys.stderr, '%d errors, exiting.' % litConfig.numErrors + sys.exit(2) + + # Return a unittest test suite which just runs the tests in order. + def get_test_fn(test): + return unittest.FunctionTestCase( + lambda: test.config.test_format.execute( + test, litConfig), + description = test.getFullName()) + + from LitTestCase import LitTestCase + return unittest.TestSuite([LitTestCase(test, litConfig) for test in tests]) + def main(): # Bump the GIL check interval, its more important to get any one thread to a # blocking operation (hopefully exec) than to try and unblock other threads.