@@ -259,6 +259,42 @@ struct KvCacheStats
259259 std::size_t allocatedBytes{};
260260};
261261
262+ /// @brief Per-iteration KV cache statistics. All delta counters represent changes since the last call to
263+ /// getIterationStats(). Gauges are instantaneous snapshots. Every member defaults to zero.
264+ struct KvCacheIterationStats
265+ {
266+ // --- Instantaneous gauges (block counts) ---
267+ // Primary (GPU) pool
268+ SizeType32 primaryMaxNumBlocks{0 };
269+ SizeType32 primaryFreeNumBlocks{0 };
270+ SizeType32 primaryUsedNumBlocks{0 }; // presumably max - free; TODO confirm in the implementation
271+ // Secondary (host) pool
272+ SizeType32 secondaryMaxNumBlocks{0 };
273+ SizeType32 secondaryFreeNumBlocks{0 };
274+ SizeType32 secondaryUsedNumBlocks{0 }; // presumably max - free; TODO confirm in the implementation
275+
276+ // --- Per-iteration deltas (reset on each read) ---
277+ // Context phase: block allocation and reuse
278+ SizeType32 iterAllocTotalBlocks{0 }; // presumably the delta of WindowBlockManager::mAllocTotalBlocks; TODO confirm
279+ SizeType32 iterAllocNewBlocks{0 }; // presumably the delta of WindowBlockManager::mAllocNewBlocks; TODO confirm
280+ SizeType32 iterReusedBlocks{0 }; // = iterFullReusedBlocks + iterPartialReusedBlocks
281+ SizeType32 iterFullReusedBlocks{0 }; // blocks fully matched in radix tree
282+ SizeType32 iterPartialReusedBlocks{0 }; // blocks partially matched in radix tree
283+ SizeType32 iterMissedBlocks{0 }; // presumably the delta of WindowBlockManager::mMissedBlocks; TODO confirm
284+ float iterCacheHitRate{0 .0f }; // formula not visible here; presumably reused / (reused + missed), TODO confirm
285+ // Generation phase: block allocation
286+ SizeType32 iterGenAllocBlocks{0 }; // presumably the delta of WindowBlockManager::mGenAllocBlocks; TODO confirm
287+
288+ // Transfer traffic deltas — host ↔ GPU (each event is reported both as a block count and as a byte count)
289+ SizeType32 iterOnboardBlocks{0 };
290+ std::size_t iterOnboardBytes{0 };
291+ SizeType32 iterOffloadBlocks{0 };
292+ std::size_t iterOffloadBytes{0 };
293+ // Intra-device (GPU → GPU) block copies (e.g. partial reuse when source block has refs)
294+ SizeType32 iterIntraDeviceCopyBlocks{0 };
295+ std::size_t iterIntraDeviceCopyBytes{0 };
296+ };
297+
262298// Basic building block of a paged KV cache - a single
263299// cache block. This class just holds metadata, no pointers
264300// since it is reused across all layers.
@@ -815,6 +851,12 @@ class WindowBlockManager
815851 return mMissedBlocks ;
816852 }
817853
854+ //! \brief Get the number of free blocks in the secondary (host) memory pool.
855+ [[nodiscard]] SizeType32 getNumFreeSecondaryBlocks () const noexcept ;
856+
857+ //! \brief Get iteration stats (deltas since last call) for this window. Resets internal delta snapshots.
858+ [[nodiscard]] KvCacheIterationStats getAndResetIterationStats (); // non-const: presumably updates the mPrev* snapshot members
859+
818860 [[nodiscard]] bool hasFreeBlocks (SizeType32 numRequired = 1 ) const
819861 {
820862 return getNumFreeBlocks () >= numRequired;
@@ -1128,16 +1170,22 @@ class WindowBlockManager
11281170 std::shared_ptr<KVCacheTransferManager> mTransferManager ;
11291171
11301172 // Statistics for block allocations/reuse
1131- // Total number of blocks allocated by all requests
1173+ // Total number of blocks allocated by all requests (context phase)
11321174 SizeType32 mAllocTotalBlocks ;
1133- // Number of new blocks that were allocated
1175+ // Number of new blocks that were allocated (context phase)
11341176 SizeType32 mAllocNewBlocks ;
1135- // Number of blocks that were reused
1177+ // Number of blocks that were fully reused (context phase)
1178+ SizeType32 mFullReusedBlocks ; // NOTE(review): no in-class initializer (the mPrev* snapshots use {0}); confirm the constructor zeroes it
1179+ // Number of blocks that were partially reused (context phase)
1180+ SizeType32 mPartialReusedBlocks ; // NOTE(review): no in-class initializer; confirm the constructor zeroes it
1181+ // Number of blocks that were reused (full + partial, i.e. mFullReusedBlocks + mPartialReusedBlocks; context phase)
11361182 SizeType32 mReusedBlocks ;
11371183 // Number of unique blocks that were reused
11381184 SizeType32 mReusedUniqueBlocks ;
1139- // Number of blocks that were not reused
1185+ // Number of blocks that were not reused (context phase)
11401186 SizeType32 mMissedBlocks ;
1187+ // Number of blocks allocated during generation phase
1188+ SizeType32 mGenAllocBlocks ; // NOTE(review): no in-class initializer; confirm the constructor zeroes it
11411189 // Only be 1 or 2. If 2: general KV stored. If 1: K == V for any token, so only K is stored to optimize the
11421190 // max_num_tokens(For DeepSeek). Controlled by mCacheType
11431191 SizeType32 mKVFactor ;
@@ -1154,6 +1202,15 @@ class WindowBlockManager
11541202 // The kv cache connector manager
11551203 std::shared_ptr<kv_connector::KvCacheConnectorManager> mKvCacheConnectorManager ;
11561204
1205+ // Snapshots of the cumulative counters (presumably the same-named members without "Prev") at the last iteration stats read; used for delta computation
1206+ SizeType32 mPrevAllocTotalBlocks {0 };
1207+ SizeType32 mPrevAllocNewBlocks {0 };
1208+ SizeType32 mPrevReusedBlocks {0 };
1209+ SizeType32 mPrevFullReusedBlocks {0 };
1210+ SizeType32 mPrevPartialReusedBlocks {0 };
1211+ SizeType32 mPrevMissedBlocks {0 };
1212+ SizeType32 mPrevGenAllocBlocks {0 };
1213+
11571214 // Mutex for the cached blocks root
11581215 mutable std::mutex mCachedBlocksRootMutex ;
11591216
@@ -1359,6 +1416,19 @@ class BlockManager
13591416 return sumWindows ([](auto const & manager) { return manager.getNumMissedBlocks (); });
13601417 }
13611418
1419+ [[nodiscard]] SizeType32 getNumSecondaryBlocks () const // secondary-pool block count aggregated over all window managers
1420+ {
1421+ return sumWindows ([](auto const & manager) { return manager.getNumSecondaryBlocks (); }); // sumWindows presumably adds up the per-window results
1422+ }
1423+
1424+ [[nodiscard]] SizeType32 getNumFreeSecondaryBlocks () const // free secondary-pool block count aggregated over all window managers
1425+ {
1426+ return sumWindows ([](auto const & manager) { return manager.getNumFreeSecondaryBlocks (); }); // sumWindows presumably adds up the per-window results
1427+ }
1428+
1429+ //! \brief Get per-window-size iteration stats (map key is the window size). Resets delta snapshots for each window.
1430+ [[nodiscard]] std::map<SizeType32, KvCacheIterationStats> getAndResetIterationStats ();
1431+
13621432 [[nodiscard]] SizeType32 getNumLayers () const
13631433 {
13641434 return mNumLayers ;
@@ -1688,6 +1758,10 @@ class BaseKVCacheManager
16881758
16891759 [[nodiscard]] virtual KvCacheStats getKvCacheStats () const = 0;
16901760
1761+ //! \brief Get per-iteration stats with delta counters, keyed by window size.
1762+ //! Resets delta snapshots on each call, so every call consumes the deltas. NOTE(review): confirm there is a single consumer per iteration.
1763+ [[nodiscard]] virtual std::map<SizeType32, KvCacheIterationStats> getIterationStats () = 0; // NOTE(review): pure virtual; confirm every concrete subclass overrides it
1764+
16911765 [[nodiscard]] virtual OffsetTableDimensions getOffsetTableDimensions () const = 0;
16921766
16931767 [[nodiscard]] virtual std::deque<executor::KVCacheEvent> getLatestEvents (
@@ -2046,6 +2120,11 @@ class KVCacheManager : public BaseKVCacheManager
20462120 return kvCacheStats;
20472121 }
20482122
2123+ [[nodiscard]] std::map<SizeType32, KvCacheIterationStats> getIterationStats () override // non-const, unlike getKvCacheStats()
2124+ {
2125+ return mBlockManager .getAndResetIterationStats (); // forwards to the block manager, which resets the delta snapshots
2126+ }
2127+
20492128 [[nodiscard]] OffsetTableDimensions getOffsetTableDimensions () const override
20502129 {
20512130 OffsetTableDimensions dims;
0 commit comments