8000 [CGP] [CodeGenPrepare] Folding `urem` with loop invariant value plus offset by goldsteinn · Pull Request #104724 · llvm/llvm-project · GitHub
[go: up one dir, main page]

Skip to content

[CGP] [CodeGenPrepare] Folding urem with loop invariant value plus offset #104724

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Oct 31, 2024

Conversation

goldsteinn
Copy link
Contributor

This extends the existing fold:

for(i = Start; i < End; ++i)
   Rem = (i nuw+- IncrLoopInvariant) u% RemAmtLoopInvariant;

->

Rem = (Start nuw+- IncrLoopInvariant) % RemAmtLoopInvariant;
for(i = Start; i < End; ++i, ++rem)
   Rem = rem == RemAmtLoopInvariant ? 0 : Rem;

To work with a non-zero IncrLoopInvariant.

This is a common usage in cases such as:

for(i = 0; i < N; ++i)
    if ((i + 1) % X) == 0)
        do_something_occasionally_but_not_first_iter();

Alive2 w/ i4/unrolled 6x (needs to be ran locally due to timeout):
https://alive2.llvm.org/ce/z/6tgyN3

Exhaust proof over all uint8_t combinations in C++:
https://godbolt.org/z/WYa561388

@llvmbot
Copy link
Member
llvmbot commented Aug 18, 2024

@llvm/pr-subscribers-backend-x86

@llvm/pr-subscribers-llvm-transforms

Author: None (goldsteinn)

Changes

This extends the existing fold:

for(i = Start; i &lt; End; ++i)
   Rem = (i nuw+- IncrLoopInvariant) u% RemAmtLoopInvariant;

->

Rem = (Start nuw+- IncrLoopInvariant) % RemAmtLoopInvariant;
for(i = Start; i &lt; End; ++i, ++rem)
   Rem = rem == RemAmtLoopInvariant ? 0 : Rem;

To work with a non-zero IncrLoopInvar 8000 iant.

This is a common usage in cases such as:

for(i = 0; i &lt; N; ++i)
    if ((i + 1) % X) == 0)
        do_something_occasionally_but_not_first_iter();

Alive2 w/ i4/unrolled 6x (needs to be ran locally due to timeout):
https://alive2.llvm.org/ce/z/6tgyN3

Exhaust proof over all uint8_t combinations in C++:
https://godbolt.org/z/WYa561388


Full diff: https://github.com/llvm/llvm-project/pull/104724.diff

2 Files Affected:

  • (modified) llvm/lib/CodeGen/CodeGenPrepare.cpp (+64-8)
  • (modified) llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll (+8-4)
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 72db1f4adabf76..2ad56031acf992 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -1976,17 +1976,43 @@ static bool foldFCmpToFPClassTest(CmpInst *Cmp, const TargetLowering &TLI,
   return true;
 }
 
-static bool isRemOfLoopIncrementWithLoopInvariant(Instruction *Rem,
-                                                  const LoopInfo *LI,
-                                                  Value *&RemAmtOut,
-                                                  PHINode *&LoopIncrPNOut) {
+static bool isRemOfLoopIncrementWithLoopInvariant(
+    Instruction *Rem, const LoopInfo *LI, Value *&RemAmtOut,
+    std::optional<bool> &AddOrSubOut, Value *&AddOrSubInstOut,
+    Value *&AddOrSubOffsetOut, PHINode *&LoopIncrPNOut) {
   Value *Incr, *RemAmt;
   // NB: If RemAmt is a power of 2 it *should* have been transformed by now.
   if (!match(Rem, m_URem(m_Value(Incr), m_Value(RemAmt))))
     return false;
 
+  std::optional<bool> AddOrSub;
+  Value *AddOrSubOffset;
   // Find out loop increment PHI.
   auto *PN = dyn_cast<PHINode>(Incr);
+  if (PN != nullptr) {
+    AddOrSub = std::nullopt;
+    AddOrSubOffset = nullptr;
+  } else {
+    // Search through a NUW add/sub on top of the loop increment.
+    Value *V0, *V1;
+    if (match(Incr, m_NUWAddLike(m_Value(V0), m_Value(V1))))
+      AddOrSub = true;
+    else if (match(Incr, m_NUWSub(m_Value(V0), m_Value(V1))))
+      AddOrSub = false;
+    else
+      return false;
+
+    AddOrSubInstOut = Incr;
+
+    PN = dyn_cast<PHINode>(V0);
+    if (PN != nullptr) {
+      AddOrSubOffset = V1;
+    } else if (*AddOrSub) {
+      PN = dyn_cast<PHINode>(V1);
+      AddOrSubOffset = V0;
+    }
+  }
+
   if (!PN)
     return false;
 
@@ -2027,6 +2053,8 @@ static bool isRemOfLoopIncrementWithLoopInvariant(Instruction *Rem,
   // Set output variables.
   RemAmtOut = RemAmt;
   LoopIncrPNOut = PN;
+  AddOrSubOut = AddOrSub;
+  AddOrSubOffsetOut = AddOrSubOffset;
 
   return true;
 }
@@ -2041,15 +2069,15 @@ static bool isRemOfLoopIncrementWithLoopInvariant(Instruction *Rem,
 // Rem = (Start nuw+ IncrLoopInvariant) % RemAmtLoopInvariant;
 // for(i = Start; i < End; ++i, ++rem)
 //    Rem = rem == RemAmtLoopInvariant ? 0 : Rem;
-//
-// Currently only implemented for `IncrLoopInvariant` being zero.
 static bool foldURemOfLoopIncrement(Instruction *Rem, const DataLayout *DL,
                                     const LoopInfo *LI,
                                     SmallSet<BasicBlock *, 32> &FreshBBs,
                                     bool IsHuge) {
-  Value *RemAmt;
+  std::optional<bool> AddOrSub;
+  Value *AddOrSubOffset, *RemAmt, *AddOrSubInst;
   PHINode *LoopIncrPN;
-  if (!isRemOfLoopIncrementWithLoopInvariant(Rem, LI, RemAmt, LoopIncrPN))
+  if (!isRemOfLoopIncrementWithLoopInvariant(
+          Rem, LI, RemAmt, AddOrSub, AddOrSubInst, AddOrSubOffset, LoopIncrPN))
     return false;
 
   // Only non-constant remainder as the extra IV is probably not profitable
@@ -2066,7 +2094,33 @@ static bool foldURemOfLoopIncrement(Instruction *Rem, const DataLayout *DL,
     return false;
   Loop *L = LI->getLoopFor(Rem->getParent());
 
+  // If we have add/sub create initial value for remainder.
+  // The logic here is:
+  // (urem (add/sub nuw Start, IncrLoopInvariant), RemAmtLoopInvariant
+  //
+  // Only proceed if the expression simplifies (otherwise we can't fully
+  // optimize out the urem).
   Value *Start = LoopIncrPN->getIncomingValueForBlock(L->getLoopPreheader());
+  if (AddOrSub) {
+    assert(AddOrSubOffset && AddOrSubInst &&
+           "We found an add/sub but missing values");
+    // Without dom-condition/assumption cache we aren't likely to get much out
+    // of a context instruction.
+    const SimplifyQuery Q(*DL);
+    bool NSW = cast<OverflowingBinaryOperator>(AddOrSubInst)->hasNoSignedWrap();
+    if (*AddOrSub)
+      Start = simplifyAddInst(Start, AddOrSubOffset, /*IsNSW=*/NSW,
+                              /*IsNUW=*/true, Q);
+    else
+      Start = simplifySubInst(Start, AddOrSubOffset, /*IsNSW=*/NSW,
+                              /*IsNUW=*/true, Q);
+    if (!Start)
+      return false;
+
+    Start = simplifyURemInst(Start, RemAmt, Q);
+    if (!Start)
+      return false;
+  }
 
   // Create new remainder with induction variable.
   Type *Ty = Rem->getType();
@@ -2093,6 +2147,8 @@ static bool foldURemOfLoopIncrement(Instruction *Rem, const DataLayout *DL,
 
   replaceAllUsesWith(Rem, NewRem, FreshBBs, IsHuge);
   Rem->eraseFromParent();
+  if (AddOrSubInst && AddOrSubInst->use_empty())
+    cast<Instruction>(AddOrSubInst)->eraseFromParent();
   return true;
 }
 
diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll b/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll
index 304ae337ed4197..61bbe08e293b39 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll
@@ -706,10 +706,12 @@ define void @simple_urem_to_sel_non_zero_start_through_add(i32 %N, i32 %rem_amt_
 ; CHECK:       [[FOR_COND_CLEANUP]]:
 ; CHECK-NEXT:    ret void
 ; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[REM:%.*]] = phi i32 [ 7, %[[FOR_BODY_PREHEADER]] ], [ [[TMP3:%.*]], %[[FOR_BODY]] ]
 ; CHECK-NEXT:    [[I_04:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 2, %[[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[I_WITH_OFF:%.*]] = add nuw i32 [[I_04]], 5
-; CHECK-NEXT:    [[REM:%.*]] = urem i32 [[I_WITH_OFF]], [[REM_AMT]]
 ; CHECK-NEXT:    tail call void @use.i32(i32 [[REM]])
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i32 [[REM]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[TMP1]], [[REM_AMT]]
+; CHECK-NEXT:    [[TMP3]] = select i1 [[TMP2]], i32 0, i32 [[TMP1]]
 ; CHECK-NEXT:    [[INC]] = add nuw i32 [[I_04]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
@@ -817,10 +819,12 @@ define void @simple_urem_to_sel_non_zero_start_through_sub(i32 %N, i32 %rem_amt,
 ; CHECK:       [[FOR_COND_CLEANUP]]:
 ; CHECK-NEXT:    ret void
 ; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[REM:%.*]] = phi i32 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[TMP3:%.*]], %[[FOR_BODY]] ]
 ; CHECK-NEXT:    [[I_04:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[START]], %[[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[I_WITH_OFF:%.*]] = sub nuw i32 [[I_04]], [[START]]
-; CHECK-NEXT:    [[REM:%.*]] = urem i32 [[I_WITH_OFF]], [[REM_AMT]]
 ; CHECK-NEXT:    tail call void @use.i32(i32 [[REM]])
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i32 [[REM]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[TMP1]], [[REM_AMT]]
+; CHECK-NEXT:    [[TMP3]] = select i1 [[TMP2]], i32 0, i32 [[TMP1]]
 ; CHECK-NEXT:    [[INC]] = add nuw i32 [[I_04]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]

@goldsteinn goldsteinn force-pushed the goldsteinn/cgp-urem-liv-offset branch from 7c66607 to 77f5b58 Compare August 20, 2024 22:48
// Without dom-condition/assumption cache we aren't likely to get much out
// of a context instruction.
const SimplifyQuery Q(*DL);
bool NSW = cast<OverflowingBinaryOperator>(AddOrSubInst)->hasNoSignedWrap();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It will assert if AddOrSubInst is a disjoint or.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed + Tests.

@goldsteinn goldsteinn force-pushed the goldsteinn/cgp-urem-liv-offset branch from 77f5b58 to 191df4d Compare August 21, 2024 18:30
@nikic
Copy link
Contributor
nikic commented Aug 21, 2024

@dtcxzyw Does this pattern exist?

I can imagine the add variant might, but sub and or disjoint are unnecessary generalization.

@dtcxzyw
Copy link
Member
dtcxzyw commented Aug 22, 2024

@dtcxzyw Does this pattern exist?

Add: abc/giaSimBase.c.ll
DisjointOr: none
Sub: none

I can imagine the add variant might, but sub and or disjoint are unnecessary generalization.

Agree.

@goldsteinn
Copy link
Contributor Author

@dtcxzyw Does this pattern exist?

Add: abc/giaSimBase.c.ll DisjointOr: none Sub: none

I can imagine the add variant might, but sub and or disjoint are unnecessary generalization.

Agree.

If I change the simplification code to just use simplifyBinOp is the minimal complexity okay to keep or disjoint (and potentially sub)?

Dropping in meantime.

@goldsteinn goldsteinn force-pushed the goldsteinn/cgp-urem-liv-offset branch from 191df4d to 71b83ed Compare August 22, 2024 17:40
Copy link
github-actions bot commented Aug 22, 2024

✅ With the latest revision this PR passed the C/C++ code formatter.

…ffset

This extends the existing fold:

```
for(i = Start; i < End; ++i)
   Rem = (i nuw+- IncrLoopInvariant) u% RemAmtLoopInvariant;
```
 ->
```
Rem = (Start nuw+- IncrLoopInvariant) % RemAmtLoopInvariant;
for(i = Start; i < End; ++i, ++rem)
   Rem = rem == RemAmtLoopInvariant ? 0 : Rem;
```

To work with a non-zero `IncrLoopInvariant`.

This is a common usage in cases such as:

```
for(i = 0; i < N; ++i)
    if ((i + 1) % X) == 0)
        do_something_occasionally_but_not_first_iter();
```

Alive2 w/ i4/unrolled 6x (needs to be ran locally due to timeout):
https://alive2.llvm.org/ce/z/6tgyN3

Exhaust proof over all uint8_t combinations in C++:
https://godbolt.org/z/WYa561388
@goldsteinn goldsteinn force-pushed the goldsteinn/cgp-urem-liv-offset branch from 71b83ed to f8ec02f Compare September 19, 2024 01:20
@goldsteinn
Copy link
Contributor Author

ping

} else {
// Search through a NUW add on top of the loop increment.
Value *V0, *V1;
if (!match(Incr, m_NUWAdd(m_Value(V0), m_Value(V1))))
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it be worth adding a m_PHI matcher - we could then use m_c_NUWAdd and avoid the messy commutative matching below.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could, although Im not sure exactly what we would want in a PHI matcher? Would we support matching BBs? Variable number of arguments or "binary" phis only? Etc...
Either way, I think that its a conversation that is a bit beyond the scope of this particular use case and PR.

@goldsteinn
Copy link
Contributor Author

ping.

Copy link
Collaborator
@RKSimon RKSimon left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM but @dtcxzyw needs to sign off as well

Copy link
Member
@dtcxzyw dtcxzyw left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LG

@goldsteinn goldsteinn merged commit 1e072ae into llvm:main Oct 31, 2024
8 checks passed
@llvm-ci
Copy link
Collaborator
llvm-ci commented Oct 31, 2024

LLVM Buildbot has detected a new failure on builder clangd-ubuntu-tsan running on clangd-ubuntu-clang while building llvm at step 6 "test-build-clangd-clangd-index-server-clangd-indexer-check-clangd".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/134/builds/7812

Here is the relevant piece of the build log for the reference
Step 6 (test-build-clangd-clangd-index-server-clangd-indexer-check-clangd) failure: test (failure)
******************** TEST 'Clangd :: target_info.test' FAILED ********************
Exit Code: 66

Command Output (stderr):
--
RUN: at line 5: rm -rf /vol/worker/clangd-ubuntu-clang/clangd-ubuntu-tsan/build/tools/clang/tools/extra/clangd/test/Output/target_info.test.tmp.dir && mkdir -p /vol/worker/clangd-ubuntu-clang/clangd-ubuntu-tsan/build/tools/clang/tools/extra/clangd/test/Output/target_info.test.tmp.dir
+ rm -rf /vol/worker/clangd-ubuntu-clang/clangd-ubuntu-tsan/build/tools/clang/tools/extra/clangd/test/Output/target_info.test.tmp.dir
+ mkdir -p /vol/worker/clangd-ubuntu-clang/clangd-ubuntu-tsan/build/tools/clang/tools/extra/clangd/test/Output/target_info.test.tmp.dir
RUN: at line 7: echo '[{"directory": "/vol/worker/clangd-ubuntu-clang/clangd-ubuntu-tsan/build/tools/clang/tools/extra/clangd/test/Output/target_info.test.tmp.dir", "command": "/vol/worker/clangd-ubuntu-clang/clangd-ubuntu-tsan/build/tools/clang/tools/extra/clangd/test/Output/target_info.test.tmp.dir/armv7-clang -x c++ the-file.cpp -v", "file": "the-file.cpp"}]' > /vol/worker/clangd-ubuntu-clang/clangd-ubuntu-tsan/build/tools/clang/tools/extra/clangd/test/Output/target_info.test.tmp.dir/compile_commands.json
+ echo '[{"directory": "/vol/worker/clangd-ubuntu-clang/clangd-ubuntu-tsan/build/tools/clang/tools/extra/clangd/test/Output/target_info.test.tmp.dir", "command": "/vol/worker/clangd-ubuntu-clang/clangd-ubuntu-tsan/build/tools/clang/tools/extra/clangd/test/Output/target_info.test.tmp.dir/armv7-clang -x c++ the-file.cpp -v", "file": "the-file.cpp"}]'
RUN: at line 9: sed -e "s|INPUT_DIR|/vol/worker/clangd-ubuntu-clang/clangd-ubuntu-tsan/build/tools/clang/tools/extra/clangd/test/Output/target_info.test.tmp.dir|g" /vol/worker/clangd-ubuntu-clang/clangd-ubuntu-tsan/llvm-project/clang-tools-extra/clangd/test/target_info.test > /vol/worker/clangd-ubuntu-clang/clangd-ubuntu-tsan/build/tools/clang/tools/extra/clangd/test/Output/target_info.test.tmp.test.1
+ sed -e 's|INPUT_DIR|/vol/worker/clangd-ubuntu-clang/clangd-ubuntu-tsan/build/tools/clang/tools/extra/clangd/test/Output/target_info.test.tmp.dir|g' /vol/worker/clangd-ubuntu-clang/clangd-ubuntu-tsan/llvm-project/clang-tools-extra/clangd/test/target_info.test
RUN: at line 12: sed -E -e 's|"file://([A-Z]):/|"file:///\1:/|g' /vol/worker/clangd-ubuntu-clang/clangd-ubuntu-tsan/build/tools/clang/tools/extra/clangd/test/Output/target_info.test.tmp.test.1 > /vol/worker/clangd-ubuntu-clang/clangd-ubuntu-tsan/build/tools/clang/tools/extra/clangd/test/Output/target_info.test.tmp.test
+ sed -E -e 's|"file://([A-Z]):/|"file:///\1:/|g' /vol/worker/clangd-ubuntu-clang/clangd-ubuntu-tsan/build/tools/clang/tools/extra/clangd/test/Output/target_info.test.tmp.test.1
RUN: at line 14: clangd -lit-test < /vol/worker/clangd-ubuntu-clang/clangd-ubuntu-tsan/build/tools/clang/tools/extra/clangd/test/Output/target_info.test.tmp.test 2>&1 | /vol/worker/clangd-ubuntu-clang/clangd-ubuntu-tsan/build/bin/FileCheck -strict-whitespace /vol/worker/clangd-ubuntu-clang/clangd-ubuntu-tsan/build/tools/clang/tools/extra/clangd/test/Output/target_info.test.tmp.test
+ clangd -lit-test
+ /vol/worker/clangd-ubuntu-clang/clangd-ubuntu-tsan/build/bin/FileCheck -strict-whitespace /vol/worker/clangd-ubuntu-clang/clangd-ubuntu-tsan/build/tools/clang/tools/extra/clangd/test/Output/target_info.test.tmp.test

--

********************


smallp-o-p pushed a commit to smallp-o-p/llvm-project that referenced this pull request Nov 3, 2024
…offset (llvm#104724)

This extends the existing fold:

```
for(i = Start; i < End; ++i)
   Rem = (i nuw+- IncrLoopInvariant) u% RemAmtLoopInvariant;
```
 ->
```
Rem = (Start nuw+- IncrLoopInvariant) % RemAmtLoopInvariant;
for(i = Start; i < End; ++i, ++rem)
   Rem = rem == RemAmtLoopInvariant ? 0 : Rem;
```

To work with a non-zero `IncrLoopInvariant`.

This is a common usage in cases such as:

```
for(i = 0; i < N; ++i)
    if ((i + 1) % X) == 0)
        do_something_occasionally_but_not_first_iter();
```

Alive2 w/ i4/unrolled 6x (needs to be ran locally due to timeout):
https://alive2.llvm.org/ce/z/6tgyN3

Exhaust proof over all uint8_t combinations in C++:
https://godbolt.org/z/WYa561388
NoumanAmir657 pushed a commit to NoumanAmir657/llvm-project that referenced this pull request Nov 4, 2024
…offset (llvm#104724)

This extends the existing fold:

```
for(i = Start; i < End; ++i)
   Rem = (i nuw+- IncrLoopInvariant) u% RemAmtLoopInvariant;
```
 ->
```
Rem = (Start nuw+- IncrLoopInvariant) % RemAmtLoopInvariant;
for(i = Start; i < End; ++i, ++rem)
   Rem = rem == RemAmtLoopInvariant ? 0 : Rem;
```

To work with a non-zero `IncrLoopInvariant`.

This is a common usage in cases such as:

```
for(i = 0; i < N; ++i)
    if ((i + 1) % X) == 0)
        do_something_occasionally_but_not_first_iter();
```

Alive2 w/ i4/unrolled 6x (needs to be ran locally due to timeout):
https://alive2.llvm.org/ce/z/6tgyN3

Exhaust proof over all uint8_t combinations in C++:
https://godbolt.org/z/WYa561388
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

6 participants
0