[analyzer] Add hack in ArrayBound to cover up missing casts

Currently there are many casts that are not modeled (i.e. ignored) by the analyzer, which can cause paradox states (e.g. negative value stored in `unsigned` variable) and false positive reports from various checkers, e.g. from `security.ArrayBound`. Unfortunately this issue is deeply rooted in the architectural limitations of the analyzer (if we started to model the casts, it would break other things). For details see the umbrella ticket #39492 This commit adds an ugly hack in `security.ArrayBound` to silence most of the false positives caused by this shortcoming of the engine. Fixes #126884
llvm · Feb 13, 2025 · d806869 · d806869
1 parent 8da8ff8
commit d806869
Show file tree

Hide file tree

Showing 2 changed files with 107 additions and 34 deletions.
diff --git a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundChecker.cpp
@@ -34,24 +34,37 @@ using namespace taint;
 using llvm::formatv;
 
 namespace {
-/// If `E` is a "clean" array subscript expression, return the type of the
-/// accessed element. If the base of the subscript expression is modified by
-/// pointer arithmetic (and not the beginning of a "full" memory region), this
-/// always returns nullopt because that's the right (or the least bad) thing to
-/// do for the diagnostic output that's relying on this.
-static std::optional<QualType> determineElementType(const Expr *E,
-                                                    const CheckerContext &C) {
+/// If `E` is an array subscript expression with a base that is "clean" (= not
+/// modified by pointer arithmetic = the beginning of a memory region), return
+/// it as a pointer to ArraySubscriptExpr; otherwise return nullptr.
+/// This helper function is used by two separate heuristics that are only valid
+/// in these "clean" cases.
+static const ArraySubscriptExpr *
+getAsCleanArraySubscriptExpr(const Expr *E, const CheckerContext &C) {
   const auto *ASE = dyn_cast<ArraySubscriptExpr>(E);
   if (!ASE)
-    return std::nullopt;
+    return nullptr;
 
   const MemRegion *SubscriptBaseReg = C.getSVal(ASE->getBase()).getAsRegion();
   if (!SubscriptBaseReg)
-    return std::nullopt;
+    return nullptr;
 
   // The base of the subscript expression is affected by pointer arithmetics,
-  // so we want to report byte offsets instead of indices.
+  // so we want to report byte offsets instead of indices and we don't want to
+  // activate the "index is unsigned -> cannot be negative" shortcut.
   if (isa<ElementRegion>(SubscriptBaseReg->StripCasts()))
+    return nullptr;
+
+  return ASE;
+}
+
+/// If `E` is a "clean" array subscript expression, return the type of the
+/// accessed element; otherwise return std::nullopt because that's the best (or
+/// least bad) option for the diagnostic generation that relies on this.
+static std::optional<QualType> determineElementType(const Expr *E,
+                                                    const CheckerContext &C) {
+  const auto *ASE = getAsCleanArraySubscriptExpr(E, C);
+  if (!ASE)
     return std::nullopt;
 
   return ASE->getType();
@@ -140,7 +153,9 @@ class ArrayBoundChecker : public Checker<check::PostStmt<ArraySubscriptExpr>,
                                    ProgramStateRef ErrorState, NonLoc Val,
                                    bool MarkTaint);
 
-  static bool isFromCtypeMacro(const Stmt *S, ASTContext &AC);
+  static bool isFromCtypeMacro(const Expr *E, ASTContext &AC);
+
+  static bool isOffsetObviouslyNonnegative(const Expr *E, CheckerContext &C);
 
   static bool isIdiomaticPastTheEndPtr(const Expr *E, ProgramStateRef State,
                                        NonLoc Offset, NonLoc Limit,
@@ -588,20 +603,48 @@ void ArrayBoundChecker::performCheck(const Expr *E, CheckerContext &C) const {
         State, ByteOffset, SVB.makeZeroArrayIndex(), SVB);
 
     if (PrecedesLowerBound) {
-      // The offset may be invalid (negative)...
-      if (!WithinLowerBound) {
-        // ...and it cannot be valid (>= 0), so report an error.
-        Messages Msgs = getPrecedesMsgs(Reg, ByteOffset);
-        reportOOB(C, PrecedesLowerBound, Msgs, ByteOffset, std::nullopt);
-        return;
+      // The analyzer thinks that the offset may be invalid (negative)...
+
+      if (isOffsetObviouslyNonnegative(E, C)) {
+        // ...but the offset is obviously non-negative (clear array subscript
+        // with an unsigned index), so we're in a buggy situation.
+
+        // TODO: Currently the analyzer ignores many casts (e.g. signed ->
+        // unsigned casts), so it can easily reach states where it will load a
+        // signed (and negative) value from an unsigned variable. This sanity
+        // check is a duct tape "solution" that silences most of the ugly false
+        // positives that are caused by this buggy behavior. Note that this is
+        // not a complete solution: this cannot silence reports where pointer
+        // arithmetic complicates the picture and cannot ensure modeling of the
+        // "unsigned index is positive with highest bit set" cases which are
+        // "usurped" by the nonsense "unsigned index is negative" case.
+        // For more information about this topic, see the umbrella ticket
+        // https://github.com/llvm/llvm-project/issues/39492
+        // TODO: Remove this hack once 'SymbolCast's are modeled properly.
+
+        if (!WithinLowerBound) {
+          // The state is completely nonsense -- let's just sink it!
+          C.addSink();
+          return;
+        }
+        // Otherwise continue on the 'WithinLowerBound' branch where the
+        // unsigned index _is_ non-negative. Don't mention this assumption as a
+        // note tag, because it would just confuse the users!
+      } else {
+        if (!WithinLowerBound) {
+          // ...and it cannot be valid (>= 0), so report an error.
+          Messages Msgs = getPrecedesMsgs(Reg, ByteOffset);
+          reportOOB(C, PrecedesLowerBound, Msgs, ByteOffset, std::nullopt);
+          return;
+        }
+        // ...but it can be valid as well, so the checker will (optimistically)
+        // assume that it's valid and mention this in the note tag.
+        SUR.recordNonNegativeAssumption();
       }
-      // ...but it can be valid as well, so the checker will (optimistically)
-      // assume that it's valid and mention this in the note tag.
-      SUR.recordNonNegativeAssumption();
     }
 
     // Actually update the state. The "if" only fails in the extremely unlikely
-    // case when compareValueToThreshold returns {nullptr, nullptr} becasue
+    // case when compareValueToThreshold returns {nullptr, nullptr} because
     // evalBinOpNN fails to evaluate the less-than operator.
     if (WithinLowerBound)
       State = WithinLowerBound;
@@ -661,7 +704,7 @@ void ArrayBoundChecker::performCheck(const Expr *E, CheckerContext &C) const {
     }
 
     // Actually update the state. The "if" only fails in the extremely unlikely
-    // case when compareValueToThreshold returns {nullptr, nullptr} becasue
+    // case when compareValueToThreshold returns {nullptr, nullptr} because
     // evalBinOpNN fails to evaluate the less-than operator.
     if (WithinUpperBound)
       State = WithinUpperBound;
@@ -726,8 +769,8 @@ void ArrayBoundChecker::reportOOB(CheckerContext &C, ProgramStateRef ErrorState,
   C.emitReport(std::move(BR));
 }
 
-bool ArrayBoundChecker::isFromCtypeMacro(const Stmt *S, ASTContext &ACtx) {
-  SourceLocation Loc = S->getBeginLoc();
+bool ArrayBoundChecker::isFromCtypeMacro(const Expr *E, ASTContext &ACtx) {
+  SourceLocation Loc = E->getBeginLoc();
   if (!Loc.isMacroID())
     return false;
 
@@ -745,6 +788,14 @@ bool ArrayBoundChecker::isFromCtypeMacro(const Stmt *S, ASTContext &ACtx) {
           (MacroName == "isupper") || (MacroName == "isxdigit"));
 }
 
+bool ArrayBoundChecker::isOffsetObviouslyNonnegative(const Expr *E,
+                                                     CheckerContext &C) {
+  const ArraySubscriptExpr *ASE = getAsCleanArraySubscriptExpr(E, C);
+  if (!ASE)
+    return false;
+  return ASE->getIdx()->getType()->isUnsignedIntegerOrEnumerationType();
+}
+
 bool ArrayBoundChecker::isInAddressOf(const Stmt *S, ASTContext &ACtx) {
   ParentMapContext &ParentCtx = ACtx.getParentMapContext();
   do {

diff --git a/clang/test/Analysis/out-of-bounds.c b/clang/test/Analysis/out-of-bounds.c
@@ -1,6 +1,7 @@
 // RUN: %clang_analyze_cc1 -Wno-array-bounds -analyzer-checker=core,security.ArrayBound,debug.ExprInspection -verify %s
 
 void clang_analyzer_value(int);
+void clang_analyzer_dump(int);
 
 // Tests doing an out-of-bounds access after the end of an array using:
 // - constant integer index
@@ -188,29 +189,50 @@ int test_cast_to_unsigned(signed char x) {
   if (x >= 0)
     return x;
   // FIXME: Here the analyzer ignores the signed -> unsigned cast, and manages to
-  // load a negative value from an unsigned variable. This causes an underflow
-  // report, which is an ugly false positive.
+  // load a negative value from an unsigned variable.
   // The underlying issue is tracked by Github ticket #39492.
   clang_analyzer_value(y); // expected-warning {{8s:{ [-128, -1] } }}
-  return table[y]; // expected-warning {{Out of bound access to memory preceding}}
+  // However, a hack in the ArrayBound checker suppresses the false positive
+  // underflow report that would be generated here.
+  return table[y]; // no-warning
 }
 
 int test_cast_to_unsigned_overflow(signed char x) {
   unsigned char y = x;
   if (x >= 0)
     return x;
-  // A variant of 'test_cast_to_unsigned' where the correct behavior would be
-  // an overflow report (because the negative values are cast to `unsigned
-  // char` values that are too large).
-  // FIXME: See comment in 'test_cast_to_unsigned'.
+  // FIXME: As in 'test_cast_to_unsigned', the analyzer thinks that this
+  // unsigned variable contains a negative value.
   clang_analyzer_value(y); // expected-warning {{8s:{ [-128, -1] } }}
-  return small_table[y]; // expected-warning {{Out of bound access to memory preceding}}
+  // FIXME: The following subscript expression should produce an overflow
+  // report (because negative signed char corresponds to unsigned char >= 128);
+  // but the hack in ArrayBound just silences reports and cannot "restore" the
+  // real execution paths.
+  return small_table[y]; // no-warning
+}
+
+int test_cast_to_unsigned_with_ptr_arith(signed char x) {
+  unsigned char y = x;
+  int *p = table - 10;
+
+  if (x >= 0)
+    return x;
+  // FIXME: As in 'test_cast_to_unsigned', the analyzer thinks that this
+  // unsigned variable contains a negative value.
+  clang_analyzer_value(y); // expected-warning {{8s:{ [-128, -1] } }}
+  // In this case, the hack in ArrayBound cannot suppress the false positive
+  // underflow report because 'p' is not the beginning of a memory region, so
+  // it cannot clearly say that an unsigned index guarantees no overflow.
+  // However, we still don't get an underflow report because (for some unclear
+  // reason) the analyzer fails to evaluate this more complex expression.
+  clang_analyzer_dump(p[y]); // expected-warning {{Unknown}}
+  return p[y]; // no-warning
+
 }
 
 int test_negative_offset_with_unsigned_idx(void) {
   // An example where the subscript operator uses an unsigned index, but the
-  // underflow report is still justified. (We should try to keep this if we
-  // silence false positives like the one in 'test_cast_to_unsigned'.)
+  // underflow report is still justified.
   int *p = table - 10;
   unsigned idx = 2u;
   return p[idx]; // expected-warning {{Out of bound access to memory preceding}}