Skip to content

Commit fd42422

Browse files
committed
Address PR review feedback: Revert global allocator, enhance planner
- Reverted global allocator 256KB bucketing to prevent memory bloat - Moved bucketing logic to AttentionMemoryPlanner (local scope) - Implemented thread safety (mutex) in AttentionMemoryPlanner - Switched to 'First Fit' reuse strategy to prevent metadata explosion - Fixed unit test logic for workspace prediction - Added unit tests for reuse and metadata stability
1 parent 557c9c5 commit fd42422

File tree

4 files changed

+67
-35
lines changed

4 files changed

+67
-35
lines changed

onnxruntime/core/providers/cuda/cuda_allocator.cc

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -43,13 +43,8 @@ void* CUDAAllocator::Alloc(size_t size) {
4343
CheckDevice(true);
4444
void* p = nullptr;
4545
if (size > 0) {
46-
// Heuristic H1: Bucket Allocations
47-
// Round up to 256 KB to reduce fragmentation
48-
constexpr size_t kBucketSize = 256 * 1024;
49-
size_t bucketed_size = ((size + kBucketSize - 1) / kBucketSize) * kBucketSize;
50-
5146
// BFCArena was updated recently to handle the exception and adjust the request size
52-
CUDA_CALL_THROW(cudaMalloc((void**)&p, bucketed_size));
47+
CUDA_CALL_THROW(cudaMalloc((void**)&p, size));
5348
}
5449
return p;
5550
}

onnxruntime/core/providers/cuda/transformers/attention_memory_planner.cc

Lines changed: 11 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -15,36 +15,31 @@ size_t AttentionMemoryPlanner::PredictWorkspaceSize(int64_t batch_size, int64_t
1515
return std::min(kLimit, static_cast<size_t>(predicted * 1.25));
1616
}
1717

18-
void* AttentionMemoryPlanner::Allocate(size_t size, const std::vector<int64_t>& shape) {
18+
void* AttentionMemoryPlanner::Allocate(size_t size) {
19+
std::lock_guard<std::mutex> lock(mutex_);
1920
size_t bucketed_size = BucketSize(size);
2021

21-
// Heuristic H3: Tensor Lifetime Reuse
22-
// Try to find a free block with exact shape match (preferred)
23-
for (auto& alloc : allocations_) {
24-
if (alloc.free && alloc.size >= bucketed_size) {
25-
if (alloc.shape == shape) {
26-
alloc.free = false;
27-
return alloc.ptr;
28-
}
29-
}
30-
}
22+
// Heuristic H3: Tensor Lifetime Reuse (First Fit among free blocks with size >= requested)
23+
// We look for a free block that is large enough.
24+
// Since we bucket, we are likely to find exact matches or slightly larger ones.
25+
// We take the first block that fits (First Fit), which usually ends the scan early.
26+
// Ideally we might want Best Fit, but First Fit is faster and usually sufficient with bucketing.
3127

32-
// Fallback: find any free block large enough
3328
for (auto& alloc : allocations_) {
3429
if (alloc.free && alloc.size >= bucketed_size) {
35-
alloc.free = false;
36-
alloc.shape = shape;
37-
return alloc.ptr;
30+
alloc.free = false;
31+
return alloc.ptr;
3832
}
3933
}
4034

4135
// Allocate new
4236
void* p = allocator_->Alloc(bucketed_size);
43-
allocations_.push_back({p, bucketed_size, false, shape});
37+
allocations_.push_back({p, bucketed_size, false});
4438
return p;
4539
}
4640

4741
void AttentionMemoryPlanner::Free(void* p) {
42+
std::lock_guard<std::mutex> lock(mutex_);
4843
for (auto& alloc : allocations_) {
4944
if (alloc.ptr == p) {
5045
alloc.free = true;

onnxruntime/core/providers/cuda/transformers/attention_memory_planner.h

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#pragma once
22
#include <vector>
33
#include <map>
4+
#include <mutex>
45
#include "core/common/common.h"
56
#include "core/framework/allocator.h"
67

@@ -13,25 +14,27 @@ class AttentionMemoryPlanner {
1314
AttentionMemoryPlanner(AllocatorPtr allocator, size_t stream_idx)
1415
: allocator_(allocator), stream_idx_(stream_idx) {}
1516

16-
void* Allocate(size_t size, const std::vector<int64_t>& shape);
17+
void* Allocate(size_t size);
1718
void Free(void* p);
1819

1920
static size_t PredictWorkspaceSize(int64_t batch_size, int64_t num_heads, int64_t seq_len, int64_t head_dim, size_t element_size);
2021

2122
private:
2223
struct Allocation {
2324
void* ptr;
24-
size_t size;
25+
size_t size; // Actual allocated size (bucketed)
2526
bool free;
26-
std::vector<int64_t> shape;
2727
};
2828

2929
AllocatorPtr allocator_;
3030
size_t stream_idx_;
3131
std::vector<Allocation> allocations_;
32+
std::mutex mutex_;
3233

3334
size_t BucketSize(size_t size) const {
3435
constexpr size_t kBucketSize = 256 * 1024; // 256 KB
36+
// Only bucket if size is large enough to matter, otherwise we waste too much on small tensors
37+
if (size < kBucketSize) return size;
3538
return ((size + kBucketSize - 1) / kBucketSize) * kBucketSize;
3639
}
3740
};

onnxruntime/test/providers/cuda/attention_mem_tests.cc

Lines changed: 49 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -19,28 +19,67 @@ class MockAllocator : public IAllocator {
1919
};
2020

2121
TEST(AttentionMemoryPlannerTest, PredictWorkspaceSize) {
22+
// predicted = 1 * 32 * 1024 * 128 * 4 = 16,777,216 bytes = 16 MB
23+
// result = min(kLimit = 512 MB, predicted * 1.25) = 16 MB * 1.25 = 20 MB = 20,971,520 bytes
2224
size_t size = AttentionMemoryPlanner::PredictWorkspaceSize(1, 32, 1024, 128, 4);
23-
// 1 * 32 * 1024 * 128 * 4 = 16,777,216 bytes = 16 MB
24-
EXPECT_EQ(size, 16777216);
25+
EXPECT_EQ(size, 20971520);
2526
}
2627

27-
TEST(AttentionMemoryPlannerTest, AllocationReuse) {
28+
TEST(AttentionMemoryPlannerTest, AllocationReuse_BestFit) {
2829
auto allocator = std::make_shared<MockAllocator>();
2930
AttentionMemoryPlanner planner(allocator, 0);
3031

31-
std::vector<int64_t> shape1 = {1, 32, 1024, 128};
32-
void* p1 = planner.Allocate(100, shape1);
32+
// Allocate 1MB (will be bucketed to 1MB if bucket size is 256KB)
33+
size_t size1 = 1024 * 1024;
34+
void* p1 = planner.Allocate(size1);
3335

3436
planner.Free(p1);
3537

36-
void* p2 = planner.Allocate(100, shape1);
37-
EXPECT_EQ(p1, p2); // Should reuse exact shape
38+
// Allocate slightly smaller size, should reuse p1
39+
size_t size2 = size1 - 1024;
40+
void* p2 = planner.Allocate(size2);
41+
EXPECT_EQ(p1, p2); // Should reuse the same pointer
3842

3943
planner.Free(p2);
44+
}
45+
46+
TEST(AttentionMemoryPlannerTest, MetadataStability_Autoregressive) {
47+
auto allocator = std::make_shared<MockAllocator>();
48+
AttentionMemoryPlanner planner(allocator, 0);
49+
50+
// Simulate autoregressive generation: seq_len increases, so buffer size increases
51+
// We want to ensure we don't keep allocating new blocks without reusing old ones if they fit.
52+
// Note: In a real scenario, we'd likely free the old smaller buffer and allocate a new larger one.
53+
// If we free the old one, it becomes available.
54+
55+
void* p_prev = nullptr;
56+
57+
// Step 1: Allocate 100KB
58+
void* p1 = planner.Allocate(100 * 1024);
59+
p_prev = p1;
60+
61+
// Step 2: Free p1, Allocate 110KB
62+
// Since 100KB < 256KB, BucketSize returns the requested size unchanged
63+
// (the "if (size < kBucketSize) return size;" branch applies),
64+
// so small allocations are exact, not rounded up to a bucket.
65+
66+
planner.Free(p1);
67+
68+
// If we allocate larger, we can't reuse the smaller block.
69+
void* p2 = planner.Allocate(110 * 1024);
70+
EXPECT_NE(p1, p2); // Can't reuse smaller block for larger request
71+
72+
planner.Free(p2);
73+
74+
// Step 3: Large allocations (bucketed)
75+
// Allocate 1MB
76+
void* p3 = planner.Allocate(1024 * 1024);
77+
planner.Free(p3);
4078

41-
std::vector<int64_t> shape2 = {1, 32, 1024, 64};
42-
void* p3 = planner.Allocate(100, shape2);
43-
EXPECT_EQ(p3, p2); // Should reuse size-compatible buffer
79+
// Allocate 0.9MB (should reuse 1MB bucket)
80+
void* p4 = planner.Allocate(900 * 1024);
81+
EXPECT_EQ(p3, p4);
82+
planner.Free(p4);
4483
}
4584

4685
}

0 commit comments

Comments
 (0)