Fix compiler crash and profitability test in memset optimisation #22

Open · wants to merge 3 commits into nanomips
4 changes: 3 additions & 1 deletion llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
@@ -37,11 +37,13 @@ class MemorySSAUpdater;
class MemSetInst;
class StoreInst;
class TargetLibraryInfo;
class TargetTransformInfo;
class Value;

class MemCpyOptPass : public PassInfoMixin<MemCpyOptPass> {
MemoryDependenceResults *MD = nullptr;
TargetLibraryInfo *TLI = nullptr;
TargetTransformInfo *TTI = nullptr;
AAResults *AA = nullptr;
AssumptionCache *AC = nullptr;
DominatorTree *DT = nullptr;
@@ -56,7 +58,7 @@ class MemCpyOptPass : public PassInfoMixin<MemCpyOptPass> {
// Glue for the old PM.
bool runImpl(Function &F, MemoryDependenceResults *MD, TargetLibraryInfo *TLI,
AAResults *AA, AssumptionCache *AC, DominatorTree *DT,
MemorySSA *MSSA);
MemorySSA *MSSA, TargetTransformInfo *TTI);

private:
// Helper functions
128 changes: 84 additions & 44 deletions llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -27,6 +27,7 @@
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/BasicBlock.h"
@@ -111,12 +112,15 @@ struct MemsetRange {
/// TheStores - The actual stores that make up this range.
SmallVector<Instruction*, 16> TheStores;

bool isProfitableToUseMemset(const DataLayout &DL) const;
bool isProfitableToUseMemset(const DataLayout &DL, TargetTransformInfo *TTI,
LLVMContext *Context) const;
};

} // end anonymous namespace

bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL,
TargetTransformInfo *TTI,
LLVMContext *Context) const {
// If the merged range will take more than 16 bytes, use
// memset. This avoids the more expensive calculation of merged
// stores.
@@ -135,47 +139,75 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
// together if it wants to.
if (TheStores.size() == 2) return false;

// Estimate the number of stores that will be used to implement a
// memset range after the DAG Combiner has merged naturally-aligned
// stores.
//
// This takes account of partial alignment information, which would
// be discarded by converting to a memset. For example:
// struct A {
// char a, b, c, d, e, f, g, h;
// int counter;
// } *Ap;
// Ap->b = Ap->c = Ap->d = Ap->e = Ap->f = Ap->g = Ap->h = 0;
//
// The overall structure alignment is 32 bits. Naively, we see 7
// single-byte stores, the first of which, b, is only known to be
// byte-aligned. However, since most architectures support 32-bit and
// 16-bit stores, these can be merged by DAGCombine into only 3
// naturally-aligned stores:
// store<(store (s8) into %ir.b...)> t0, Constant:i8<0>...
// store<(store (s16) into %ir.c), trunc to i16> t0, Constant:i32<0>...
// store<(store (s32) into %ir.e)> t0, Constant:i32<0>...

int Offset = Start;
int OffsetFromMaxAlign = MaxAlignment - MaxAlignmentOffset;
int StoreCount = 0;
// Since we don't have perfect knowledge here, make some assumptions: assume
// the maximum GPR width is the same size as the largest legal integer
// size. If so, check to see whether we will end up actually reducing the
// number of stores used.
unsigned MaxIntSize = DL.getLargestLegalIntTypeSizeInBits() / 8;
if (MaxIntSize == 0)
MaxIntSize = 1;

bool AllowMisaligned = TTI->allowsMisalignedMemoryAccesses(
*Context, MaxIntSize);

if (AllowMisaligned) {
// Misaligned accesses are permitted. We can assume that inlining a
// memset() call can be inlined to MaxIntSize'd stores, plus single-byte
// stores, regardless of the alignment of the destination pointer.

unsigned Bytes = unsigned(End-Start);

while (Offset < End) {
unsigned StoreSize = 1;
for (unsigned NextStoreSize = 2;
NextStoreSize <= MaxIntSize && End - Offset >= NextStoreSize;
NextStoreSize *= 2) {
uint64_t StoreAlign = (DL.getABIIntegerTypeAlignment(8 * NextStoreSize)
.value());
if (OffsetFromMaxAlign % StoreAlign == 0)
StoreSize = NextStoreSize;
unsigned NumPointerStores = Bytes / MaxIntSize;

// Assume the remaining bytes if any are done a byte at a time.
unsigned NumByteStores = Bytes % MaxIntSize;

// If we will reduce the # stores (according to this heuristic), do the
// transformation. This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32
// etc.
return TheStores.size() > NumPointerStores+NumByteStores;
} else {
// Estimate the number of stores that would be used to implement the
// stores in the range after the DAG Combiner has merged any
// naturally-aligned stores.
//
// This takes account of partial alignment information, which would
// be discarded by converting to a memset. For example:
// struct A {
// char a, b, c, d, e, f, g, h;
// int counter;
// } *Ap;
// Ap->b = Ap->c = Ap->d = Ap->e = Ap->f = Ap->g = Ap->h = 0;
//
// The overall structure alignment is 32 bits. Naively, we see 7
// single-byte stores, the first of which, b, is only known to be
// byte-aligned. However, since most architectures support 32-bit and
// 16-bit stores, these can be merged by DAGCombine into only 3
// naturally-aligned stores:
// store<(store (s8) into %ir.b...)> t0, Constant:i8<0>...
// store<(store (s16) into %ir.c), trunc to i16> t0, Constant:i32<0>...
// store<(store (s32) into %ir.e)> t0, Constant:i32<0>...

int Offset = Start;
int OffsetFromMaxAlign = MaxAlignment - MaxAlignmentOffset;
int StoreCount = 0;

while (Offset < End) {
unsigned StoreSize = 1;
for (unsigned NextStoreSize = 2;
NextStoreSize <= MaxIntSize && End - Offset >= NextStoreSize;
NextStoreSize *= 2) {
uint64_t StoreAlign = (DL.getABIIntegerTypeAlignment(8 * NextStoreSize)
.value());
if (OffsetFromMaxAlign % StoreAlign == 0)
StoreSize = NextStoreSize;
}
OffsetFromMaxAlign += StoreSize;
Offset += StoreSize;
StoreCount++;
}
OffsetFromMaxAlign += StoreSize;
Offset += StoreSize;
StoreCount++;
return StoreCount > 4;
}
return StoreCount > 4;
}

namespace {
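To make the new profitability test concrete, here is a standalone sketch of both branches (not part of the patch; the DataLayout and TTI queries are replaced by plain parameters, and each integer type's ABI alignment is assumed to equal its size), applied to the struct A example from the comment above:

#include <cassert>

// Branch 1: the target allows misaligned accesses. Cover the range with
// MaxIntSize-wide stores plus trailing byte stores, then compare against
// the number of stores we started with.
static bool profitableMisaligned(unsigned NumStores, int Start, int End,
                                 unsigned MaxIntSize) {
  unsigned Bytes = unsigned(End - Start);
  unsigned NumPointerStores = Bytes / MaxIntSize; // full-width stores
  unsigned NumByteStores = Bytes % MaxIntSize;    // leftover single bytes
  return NumStores > NumPointerStores + NumByteStores;
}

// Branch 2: the target requires natural alignment. Walk the range, growing
// each store to the widest power-of-two size whose (assumed size-equal)
// ABI alignment the current offset satisfies.
static bool profitableAligned(int Start, int End, int MaxAlignment,
                              int MaxAlignmentOffset, unsigned MaxIntSize) {
  int Offset = Start;
  int OffsetFromMaxAlign = MaxAlignment - MaxAlignmentOffset;
  int StoreCount = 0;
  while (Offset < End) {
    unsigned StoreSize = 1;
    for (unsigned Next = 2;
         Next <= MaxIntSize && unsigned(End - Offset) >= Next; Next *= 2)
      if (OffsetFromMaxAlign % int(Next) == 0)
        StoreSize = Next;
    OffsetFromMaxAlign += StoreSize;
    Offset += StoreSize;
    ++StoreCount;
  }
  return StoreCount > 4;
}

int main() {
  // struct A: seven i8 stores to Ap->b..Ap->h, with MaxIntSize = 4.
  // Misaligned branch: 7 bytes -> 1 word store + 3 byte stores; 7 > 4,
  // so converting to memset is considered profitable.
  assert(profitableMisaligned(7, 1, 8, 4));
  // Aligned branch: the same bytes merge to s8 + s16 + s32 = 3 stores
  // (the walk starts one byte past a 4-byte boundary: 4 - 3 = 1);
  // 3 > 4 fails, so the stores are left alone.
  assert(!profitableAligned(1, 8, 4, 3, 4));
  return 0;
}

This also shows why the two tests below pin a triple: the same IR can be profitable on a target with misaligned access support and unprofitable on one without.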
@@ -265,7 +297,10 @@ void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
I->Start = Start;
I->StartPtr = Ptr;
I->Alignment = Alignment;
I->MaxAlignmentOffset = (I->MaxAlignmentOffset + Size) % I->MaxAlignment;
if (I->MaxAlignment != 0)
I->MaxAlignmentOffset = (I->MaxAlignmentOffset + Size) % I->MaxAlignment;
else
I->MaxAlignmentOffset = 0;
}

// Does this store provide a better alignment than we have
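The compiler crash referenced in the PR title is the unconditional modulo replaced above: when no store has yet established an alignment for the range, MaxAlignment is 0, and `% 0` is an integer division by zero, which is undefined behaviour and traps on most hosts. A minimal standalone sketch of the guarded update (field names from the patch, types assumed):

#include <cstdint>

struct RangeInfo {
  int64_t MaxAlignment = 0;       // 0 until some store establishes alignment
  int64_t MaxAlignmentOffset = 0; // rolling offset within that alignment
};

// Guarded rolling update mirroring the patched addRange: the old
// unconditional "% MaxAlignment" crashed whenever MaxAlignment was 0.
static void updateMaxAlignmentOffset(RangeInfo &I, int64_t Size) {
  if (I.MaxAlignment != 0)
    I.MaxAlignmentOffset = (I.MaxAlignmentOffset + Size) % I.MaxAlignment;
  else
    I.MaxAlignmentOffset = 0; // no alignment known yet; offset is meaningless
}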
@@ -321,6 +356,7 @@ class MemCpyOptLegacyPass : public FunctionPass {
AU.addRequired<TargetLibraryInfoWrapperPass>();
if (!EnableMemorySSA)
AU.addRequired<MemoryDependenceWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addPreserved<MemoryDependenceWrapperPass>();
AU.addRequired<AAResultsWrapperPass>();
AU.addPreserved<AAResultsWrapperPass>();
@@ -346,6 +382,7 @@ INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(MemCpyOptLegacyPass, "memcpyopt", "MemCpy Optimization",
false, false)

@@ -524,7 +561,7 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
if (Range.TheStores.size() == 1) continue;

// If it is profitable to lower this range to memset, do so now.
if (!Range.isProfitableToUseMemset(DL))
if (!Range.isProfitableToUseMemset(DL, TTI, &StartInst->getContext()))
continue;

// Otherwise, we do want to transform this! Create a new memset.
@@ -1777,11 +1814,12 @@ PreservedAnalyses MemCpyOptPass::run(Function &F, FunctionAnalysisManager &AM) {
auto *AA = &AM.getResult<AAManager>(F);
auto *AC = &AM.getResult<AssumptionAnalysis>(F);
auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
auto *MSSA = EnableMemorySSA ? &AM.getResult<MemorySSAAnalysis>(F)
: AM.getCachedResult<MemorySSAAnalysis>(F);

bool MadeChange =
runImpl(F, MD, &TLI, AA, AC, DT, MSSA ? &MSSA->getMSSA() : nullptr);
runImpl(F, MD, &TLI, AA, AC, DT, MSSA ? &MSSA->getMSSA() : nullptr, TTI);
if (!MadeChange)
return PreservedAnalyses::all();

@@ -1797,14 +1835,15 @@ PreservedAnalyses MemCpyOptPass::run(Function &F, FunctionAnalysisManager &AM) {
bool MemCpyOptPass::runImpl(Function &F, MemoryDependenceResults *MD_,
TargetLibraryInfo *TLI_, AliasAnalysis *AA_,
AssumptionCache *AC_, DominatorTree *DT_,
MemorySSA *MSSA_) {
MemorySSA *MSSA_, TargetTransformInfo *TTI_) {
bool MadeChange = false;
MD = MD_;
TLI = TLI_;
AA = AA_;
AC = AC_;
DT = DT_;
MSSA = MSSA_;
TTI = TTI_;
MemorySSAUpdater MSSAU_(MSSA_);
MSSAU = MSSA_ ? &MSSAU_ : nullptr;
// If we don't have at least memset and memcpy, there is little point of doing
@@ -1838,10 +1877,11 @@ bool MemCpyOptLegacyPass::runOnFunction(Function &F) {
auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
auto *MSSAWP = EnableMemorySSA
? &getAnalysis<MemorySSAWrapperPass>()
: getAnalysisIfAvailable<MemorySSAWrapperPass>();

return Impl.runImpl(F, MDWP ? &MDWP->getMemDep() : nullptr, TLI, AA, AC, DT,
MSSAWP ? &MSSAWP->getMSSA() : nullptr);
MSSAWP ? &MSSAWP->getMSSA() : nullptr, TTI);
}
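A knock-on effect of the new trailing parameter: out-of-tree code that calls the old-PM glue directly must now thread TTI through as well. A hypothetical call site (sketch only; assumes the analyses were fetched the same way runOnFunction does above):

#include "llvm/Transforms/Scalar/MemCpyOptimizer.h"
using namespace llvm;

// Hypothetical wrapper around the updated glue API; every pointer here is
// assumed to come from the corresponding wrapper pass or analysis manager.
static bool runMemCpyOpt(Function &F, MemoryDependenceResults *MD,
                         TargetLibraryInfo *TLI, AAResults *AA,
                         AssumptionCache *AC, DominatorTree *DT,
                         MemorySSA *MSSA, TargetTransformInfo *TTI) {
  MemCpyOptPass Impl;
  return Impl.runImpl(F, MD, TLI, AA, AC, DT, MSSA, TTI);
}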
31 changes: 31 additions & 0 deletions llvm/test/Transforms/MemCpyOpt/profitable-memset-aligned.ll
@@ -0,0 +1,31 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; Supported only on targets that don't have native misaligned access
; REQUIRES: mips-registered-target
; RUN: opt -mtriple mips < %s -memcpyopt -S -enable-memcpyopt-memoryssa=0 | FileCheck %s
; RUN: opt -mtriple mips < %s -memcpyopt -S -enable-memcpyopt-memoryssa=1 -verify-memoryssa | FileCheck %s

target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"

define void @foo(i64* nocapture %P) {
; CHECK-LABEL: @foo(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = bitcast i64* [[P:%.*]] to i16*
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 1
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[ARRAYIDX]] to i32*
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 3
; CHECK-NEXT: store i16 0, i16* [[TMP0]], align 2
; CHECK-NEXT: store i32 0, i32* [[TMP1]], align 4
; CHECK-NEXT: store i16 0, i16* [[ARRAYIDX1]], align 2
; CHECK-NEXT: ret void
;
entry:
%0 = bitcast i64* %P to i16*
%arrayidx = getelementptr inbounds i16, i16* %0, i64 1
%1 = bitcast i16* %arrayidx to i32*
%arrayidx1 = getelementptr inbounds i16, i16* %0, i64 3
store i16 0, i16* %0, align 2
store i32 0, i32* %1, align 4
store i16 0, i16* %arrayidx1, align 2
ret void
}

6 changes: 4 additions & 2 deletions llvm/test/Transforms/MemCpyOpt/profitable-memset.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -memcpyopt -S -enable-memcpyopt-memoryssa=0 | FileCheck %s
; RUN: opt < %s -memcpyopt -S -enable-memcpyopt-memoryssa=1 -verify-memoryssa | FileCheck %s
; Requires a target with native misaligned access support
; REQUIRES: aarch64-registered-target
; RUN: opt -mtriple=aarch64 < %s -memcpyopt -S -enable-memcpyopt-memoryssa=0 | FileCheck %s
; RUN: opt -mtriple=aarch64 < %s -memcpyopt -S -enable-memcpyopt-memoryssa=1 -verify-memoryssa | FileCheck %s

target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
