Skip to content

Add memory::madvise::will_need_multiple_pages()#7124

Merged
xzfc merged 1 commit into dev from
will_need_multiple_pages
Aug 25, 2025
Merged

Add memory::madvise::will_need_multiple_pages()#7124
xzfc merged 1 commit into dev from
will_need_multiple_pages

Conversation

@xzfc
Copy link
Copy Markdown
Member

@xzfc xzfc commented Aug 25, 2025

This PR adds will_need_multiple_pages with the following doc/signature, and enables it in the GraphLinksView::links_with_vectors iterator.

/// Trigger readahead for a memory-mapped region by calling
/// `madvise(MADV_WILLNEED)` on it.
///
/// Use-case: the `region` is inside `MADV_RANDOM` memory map, but it spans
/// across more than one 4KiB page. If you read it in sequence, it will cause
/// multiple page faults, thus multiple 4KiB I/O operations. Avoid this by
/// calling this function before reading the region. It will prefetch the whole
/// region in a single I/O operation (if possible).
///
/// Note: if the region fits within a single page, this function is a no-op.
#[cfg(unix)]
pub fn will_need_multiple_pages(region: &[u8]) {

I've experimented with benchmarking various approaches using pread, posix_fadvise, madvise, readahead. All of them show similar performance (compared to raw access to mmap), so I chose madvise as it doesn't require file descriptors (and we don't store them for mmaps).

benchmark.c
#define _GNU_SOURCE
#define _FILE_OFFSET_BITS 64

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <time.h>
#include <unistd.h>

#define EPOCHS 30
#define ITERATIONS_PER_EPOCH 2000
#define BLOCK_SIZE 2048
#define PAGE_SIZE 4096

/* Abort-on-failure assertion for syscalls: if `cond` is false, print the
 * failing expression with its source line, report errno via perror, and
 * exit(1). Wrapped in do/while(0) so it expands to a single statement and
 * composes safely with `if`/`else`. */
#define CHECK(cond)                                                            \
  do {                                                                         \
    if (!(cond)) {                                                             \
      fprintf(stderr, "CHECK failed at line %d: %s\n", __LINE__, #cond);       \
      perror("Error");                                                         \
      exit(1);                                                                 \
    }                                                                          \
  } while (0)

/* qsort comparator: sorts doubles in ascending order.
 *
 * Defined at file scope (static) rather than nested inside main():
 * nested functions are a GCC-only extension and do not compile with
 * clang or other conforming compilers. */
static int compare_double(const void *a, const void *b) {
  double da = *(const double *)a;
  double db = *(const double *)b;
  return (da > db) - (da < db);
}

/* Micro-benchmark: measures the median latency of a random BLOCK_SIZE-byte
 * read from a file, using one of several access/prefetch strategies selected
 * on the command line.
 *
 * Usage: benchmark <mode> <file>
 *
 * Each of EPOCHS epochs performs ITERATIONS_PER_EPOCH random reads; the
 * median per-access time across epochs (in microseconds) is printed. */
int main(int argc, char *argv[]) {
  int mode = -1;

  /* Mode table: the index doubles as the `mode` selector in the switch. */
  struct {
    const char *name;
    const char *desc;
  } modes[] = {{"pread", "pread system calls"},
               {"mmap", "memory mapped access"},
               {"prefetch", "mmap + pread prefetch"},
               {"optprefetch", "mmap + optimized pread prefetch"},
               {"fadvise", "mmap + posix_fadvise WILLNEED"},
               {"readahead", "mmap + readahead"},
               {"madvise", "mmap + madvise WILLNEED"}};
  /* Derive the count from the array instead of hardcoding 7. */
  const int num_modes = (int)(sizeof(modes) / sizeof(modes[0]));

  if (argc != 3) {
    fprintf(stderr, "Usage: %s <mode> <file>\nModes:\n", argv[0]);
    for (int i = 0; i < num_modes; i++) {
      fprintf(stderr, "  %s - %s\n", modes[i].name, modes[i].desc);
    }
    exit(1);
  }

  for (int i = 0; i < num_modes; i++) {
    if (strcmp(argv[1], modes[i].name) == 0) {
      mode = i;
      break;
    }
  }

  CHECK(mode >= 0);
  printf("Using %s for random access.\n", modes[mode].desc);

  int fd = open(argv[2], O_RDONLY);
  CHECK(fd != -1);
  /* Drop cached pages for this file so every run starts cold. */
  CHECK(posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED) == 0);

  struct stat st;
  CHECK(fstat(fd, &st) != -1);

  size_t file_size = st.st_size;
  /* Strictly greater than BLOCK_SIZE: if they were equal, max_offset below
   * would be zero and `rand() % max_offset` would divide by zero (UB). */
  CHECK(file_size > BLOCK_SIZE);

  void *mapped = mmap(NULL, file_size, PROT_READ, MAP_PRIVATE, fd, 0);
  CHECK(mapped != MAP_FAILED);

  void *buffer = malloc(BLOCK_SIZE);
  CHECK(buffer);

  /* Evict resident pages, then opt out of kernel readahead so each page
   * fault costs one small I/O — the behavior being benchmarked. */
  CHECK(madvise(mapped, file_size, MADV_DONTNEED) == 0);
  CHECK(madvise(mapped, file_size, MADV_RANDOM) == 0);

  srand(time(NULL));

  double epoch_times[EPOCHS];

  for (int epoch = 0; epoch < EPOCHS; epoch++) {
    struct timespec start, end;
    CHECK(clock_gettime(CLOCK_MONOTONIC, &start) == 0);

    for (int i = 0; i < ITERATIONS_PER_EPOCH; i++) {
      size_t max_offset = file_size - BLOCK_SIZE;
      size_t offset = rand() % max_offset;

      uint64_t *data = NULL;
      switch (mode) {
      case 0: /* pread: plain syscall into a private buffer */
        CHECK(pread(fd, buffer, BLOCK_SIZE, offset) == BLOCK_SIZE);
        data = (uint64_t *)buffer;
        break;
      case 1: /* mmap: page faults perform the I/O */
        data = (uint64_t *)((char *)mapped + offset);
        break;
      case 2: /* prefetch: full-block pread warms the page cache, then mmap */
        CHECK(pread(fd, buffer, BLOCK_SIZE, offset) == BLOCK_SIZE);
        data = (uint64_t *)((char *)mapped + offset);
        break;
      case 3: { /* optprefetch: pread only when the block spans >1 page */
        size_t range_end = offset + BLOCK_SIZE - 1;
        size_t first_page = offset & ~(PAGE_SIZE - 1);
        size_t last_page = range_end & ~(PAGE_SIZE - 1);

        if (first_page != last_page) {
          /* Read from the last byte of the first page through the first
           * byte of the last page: a short pread that crosses every page
           * boundary within the block's span. */
          size_t prefetch_start = first_page + PAGE_SIZE - 1;
          size_t prefetch_end = last_page + 1;
          size_t prefetch_size = prefetch_end - prefetch_start;
          CHECK(prefetch_size <= BLOCK_SIZE); /* don't overrun `buffer` */
          CHECK(pread(fd, buffer, prefetch_size, prefetch_start) ==
                (ssize_t)prefetch_size);
        }

        data = (uint64_t *)((char *)mapped + offset);
        break;
      }
      case 4: /* fadvise: hint the kernel through the file descriptor */
        CHECK(posix_fadvise(fd, offset, BLOCK_SIZE, POSIX_FADV_WILLNEED) == 0);
        data = (uint64_t *)((char *)mapped + offset);
        break;
      case 5: /* readahead: explicit readahead(2) (Linux-specific) */
        CHECK(readahead(fd, offset, BLOCK_SIZE) == 0);
        data = (uint64_t *)((char *)mapped + offset);
        break;
      case 6: { /* madvise: MADV_WILLNEED on the page-aligned span */
        size_t page_start = offset & ~(PAGE_SIZE - 1);
        size_t page_end =
            (offset + BLOCK_SIZE + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
        CHECK(madvise((char *)mapped + page_start, page_end - page_start,
                      MADV_WILLNEED) == 0);
        data = (uint64_t *)((char *)mapped + offset);
        break;
      }
      default:
        CHECK(0); /* unreachable: mode was validated above */
      }

      /* Actually read the block so the prefetch strategies are exercised. */
      uint64_t sum = 0;
      for (size_t j = 0; j < BLOCK_SIZE / sizeof(uint64_t); j++)
        sum += data[j];

      /* Keep `sum` (and thus the loads above) from being optimized away. */
      __asm__ volatile("" : : "r"(sum) : "memory");
    }

    CHECK(clock_gettime(CLOCK_MONOTONIC, &end) == 0);

    double elapsed =
        (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
    epoch_times[epoch] = (elapsed / ITERATIONS_PER_EPOCH) * 1e6;
  }

  qsort(epoch_times, EPOCHS, sizeof(double), compare_double);

  /* Explicit cleanup (the OS would reclaim at exit, but this keeps the
   * program clean under leak checkers). */
  CHECK(munmap(mapped, file_size) == 0);
  free(buffer);
  close(fd);
  printf("Median time per random access: %.2f microseconds\n",
         epoch_times[EPOCHS / 2]);
  return 0;
}

I've benchmarked average search time with this change on my hnsw-with-vector branch (not published yet). Parameters: m=16, 2-bit binary quantization, LAION 400M dataset.

ef=limit=32; madvise gives -0.19s; rescore gives +0.24s
           | no madvise | madvise  |
no rescore | 0.62       | 0.431182 |
rescore    | 0.868034   | 0.669058 |


ef=limit=64; madvise gives -0.33s; rescore gives +0.45s
           | no madvise | madvise  |
no rescore | 1.05       | 0.715974 |
rescore    | 1.50       | 1.166108 |

Perhaps later we can also use this function when accessing regular non-quantized vectors from the storage.

@xzfc xzfc requested review from generall and timvisee August 25, 2025 01:05
@coderabbitai
Copy link
Copy Markdown
Contributor

coderabbitai Bot commented Aug 25, 2025

📝 Walkthrough

Walkthrough

  • Cargo.toml: Expanded nix dependency features from ["fs"] to ["fs", "feature"] in workspace.dependencies.
  • lib/common/memory/src/madvise.rs: Added will_need_multiple_pages(region: &[u8]) on Unix that aligns to page boundaries, computes covered length, and calls madvise(MADV_WILLNEED) for multi-page regions; panics in debug on error. Introduced LazyLock-cached page size mask via get_page_mask(), with validation and warning on failure. Added no-op stub for non-Unix.
  • lib/segment/src/index/hnsw_index/graph_links/view.rs: Invokes memory::madvise::will_need_multiple_pages on the neighbors slice in the CompressedWithVectors path within links_with_vectors.

Estimated code review effort

🎯 3 (Moderate) | ⏱️ ~25 minutes

Suggested reviewers

  • timvisee
  • coszio
  • generall

Tip

🔌 Remote MCP (Model Context Protocol) integration is now available!

Pro plan users can now connect to remote MCP servers from the Integrations page. Connect with popular remote MCPs such as Notion and Linear to add more context to your reviews and chats.

✨ Finishing Touches
  • 📝 Generate Docstrings
🧪 Generate unit tests
  • Create PR with unit tests
  • Post copyable unit tests in a comment
  • Commit unit tests in branch will_need_multiple_pages

Thanks for using CodeRabbit! It's free for OSS, and your support helps us grow. If you like it, consider giving us a shout-out.

❤️ Share
🪧 Tips

Chat

There are 3 ways to chat with CodeRabbit:

  • Review comments: Directly reply to a review comment made by CodeRabbit. Example:
    • I pushed a fix in commit <commit_id>, please review it.
    • Open a follow-up GitHub issue for this discussion.
  • Files and specific lines of code (under the "Files changed" tab): Tag @coderabbitai in a new review comment at the desired location with your query.
  • PR comments: Tag @coderabbitai in a new PR comment to ask questions about the PR branch. For the best results, please provide a very specific query, as very limited context is provided in this mode. Examples:
    • @coderabbitai gather interesting stats about this repository and render them as a table. Additionally, render a pie chart showing the language distribution in the codebase.
    • @coderabbitai read the files in the src/scheduler package and generate a class diagram using mermaid and a README in the markdown format.

Support

Need help? Create a ticket on our support page for assistance with any issues or questions.

CodeRabbit Commands (Invoked using PR/Issue comments)

Type @coderabbitai help to get the list of available commands.

Other keywords and placeholders

  • Add @coderabbitai ignore anywhere in the PR description to prevent this PR from being reviewed.
  • Add @coderabbitai summary to generate the high-level summary at a specific location in the PR description.
  • Add @coderabbitai anywhere in the PR title to generate the title automatically.

CodeRabbit Configuration File (.coderabbit.yaml)

  • You can programmatically configure CodeRabbit by adding a .coderabbit.yaml file to the root of your repository.
  • Please see the configuration documentation for more information.
  • If your editor has YAML language server enabled, you can add the path at the top of this file to enable auto-completion and validation: # yaml-language-server: $schema=https://coderabbit.ai/integrations/schema.v2.json

Status, Documentation and Community

  • Visit our Status Page to check the current availability of CodeRabbit.
  • Visit our Documentation for detailed information on how to use CodeRabbit.
  • Join our Discord Community to get help, request features, and share feedback.
  • Follow us on X/Twitter for updates and announcements.

Copy link
Copy Markdown
Contributor

@coderabbitai coderabbitai Bot left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actionable comments posted: 0

🧹 Nitpick comments (5)
lib/common/memory/src/madvise.rs (4)

193-196: Off-by-one in single-page check (causes unnecessary madvise at exact page boundary)

When offset_in_page + region.len() == page_size, the region still fits within a single page, but the current condition triggers madvise. Harmless, but contrary to the doc promise and adds syscall overhead.

Apply this diff:

-    if length <= page_mask {
+    let page_size = page_mask + 1;
+    if length <= page_size {
         // Data fits within a single page, do nothing.
         return;
     }

176-183: Doc wording: avoid hardcoding 4KiB; use “system page size”

Some targets (e.g., AArch64) commonly use 64KiB pages. Suggest making the docs platform-neutral.

-/// Use-case: the `region` is inside `MADV_RANDOM` memory map, but it spans
-/// across more than one 4KiB page. If you read it in sequence, it will cause
+/// Use-case: the `region` is inside a `MADV_RANDOM` memory map, but it spans
+/// across more than one OS page (page size is system-dependent). If you read it in sequence, it will cause
-/// multiple page faults, thus multiple 4KiB I/O operations. Avoid this by
+/// multiple page faults, thus multiple page-sized I/O operations. Avoid this by
 /// calling this function before reading the region. It will prefetch the whole
 /// region in a single I/O operation. (if possible)

201-208: Consider tracing errors in release builds (debug panics are fine)

In debug you panic on madvise failure; in release failures are silently ignored. A low-verbosity log (trace/debug) would help diagnose unexpected performance regressions without panicking.

     let res = unsafe { nix::libc::madvise(addr as *mut _, length, nix::libc::MADV_WILLNEED) };
     if res != 0 {
         #[cfg(debug_assertions)]
         {
             let err = io::Error::last_os_error();
             panic!("Failed to call madvise(MADV_WILLNEED): {err}");
         }
+        #[cfg(not(debug_assertions))]
+        {
+            let err = io::Error::last_os_error();
+            log::trace!("madvise(MADV_WILLNEED) failed (ignored): {err}");
+        }
     }

214-232: Minor: return the page size mask name aligns with value, but the doc can be clearer

PAGE_SIZE_MASK stores page_size - 1. The doc line says “Typically 0xfff for 4KiB pages” which is correct; consider also mentioning it’s page_size - 1 to aid future readers.

-/// Page size mask. Typically 0xfff for 4KiB pages.
+/// Page size mask (`page_size - 1`). Typically `0xfff` for 4KiB pages.
lib/segment/src/index/hnsw_index/graph_links/view.rs (1)

271-275: Optional: prefetch vectors only when links block is tiny

If profiles show many small neighbor blocks barely spanning a page, consider moving the hint to just before reading vectors (after computing pos), and pass &neighbors[pos..end]. This narrows readahead to the typically largest portion. Keep as-is if the broader prefetch proves better in practice.

📜 Review details

Configuration used: CodeRabbit UI

Review profile: CHILL

Plan: Pro

💡 Knowledge Base configuration:

  • MCP integration is disabled by default for public repositories
  • Jira integration is disabled by default for public repositories
  • Linear integration is disabled by default for public repositories

You can enable these sources in your CodeRabbit configuration.

📥 Commits

Reviewing files that changed from the base of the PR and between 5235d86 and e7dff7e.

📒 Files selected for processing (3)
  • Cargo.toml (1 hunks)
  • lib/common/memory/src/madvise.rs (1 hunks)
  • lib/segment/src/index/hnsw_index/graph_links/view.rs (1 hunks)
🧰 Additional context used
🧠 Learnings (2)
📓 Common learnings
Learnt from: IvanPleshkov
PR: qdrant/qdrant#7043
File: lib/segment/src/vector_storage/quantized/quantized_mmap_storage.rs:86-90
Timestamp: 2025-08-15T11:42:00.297Z
Learning: In lib/segment/src/vector_storage/quantized/quantized_mmap_storage.rs, overflow protection for encoded_storage_size computation (quantized_vector_size * vectors_count) is implemented in PR #7048, using checked_mul with u64 casting to prevent silent overflow.
📚 Learning: 2025-08-06T09:56:59.311Z
Learnt from: xzfc
PR: qdrant/qdrant#6982
File: lib/segment/src/index/hnsw_index/graph_links/view.rs:217-220
Timestamp: 2025-08-06T09:56:59.311Z
Learning: In Qdrant's GraphLinksView implementation (lib/segment/src/index/hnsw_index/graph_links/view.rs), methods like links() and links_with_vectors() intentionally use unwrap() and can panic for performance reasons, maintaining consistency across similar methods. Error handling improvements are considered secondary to feature completion and require benchmarking before implementation.

Applied to files:

  • lib/segment/src/index/hnsw_index/graph_links/view.rs
🧬 Code graph analysis (1)
lib/segment/src/index/hnsw_index/graph_links/view.rs (1)
lib/common/memory/src/madvise.rs (6)
  • madvise (93-95)
  • madvise (101-101)
  • madvise (107-113)
  • madvise (131-137)
  • will_need_multiple_pages (184-209)
  • will_need_multiple_pages (212-212)
⏰ Context from checks skipped due to timeout of 90000ms. You can increase the timeout in your CodeRabbit configuration to a maximum of 15 minutes (900000ms). (12)
  • GitHub Check: e2e-tests
  • GitHub Check: test-shard-snapshot-api-s3-minio
  • GitHub Check: test-consistency
  • GitHub Check: test-consensus-compose
  • GitHub Check: integration-tests-consensus
  • GitHub Check: integration-tests
  • GitHub Check: rust-tests (macos-latest)
  • GitHub Check: rust-tests (windows-latest)
  • GitHub Check: rust-tests (ubuntu-latest)
  • GitHub Check: storage-compat-test
  • GitHub Check: rust-tests-no-rocksdb (ubuntu-latest)
  • GitHub Check: lint
🔇 Additional comments (2)
Cargo.toml (1)

229-229: Fix nix feature gating: use "unistd" (not "feature") to enable sysconf()

get_page_mask() uses nix::unistd::sysconf, which requires the unistd feature. "feature" is not a valid nix feature and will cause a compile-time error.

Apply this diff:

-nix = { version = "0.29", features = ["fs", "feature"] }
+nix = { version = "0.29", features = ["fs", "unistd"] }

Likely an incorrect or invalid review comment.

lib/segment/src/index/hnsw_index/graph_links/view.rs (1)

271-275: LGTM: targeted readahead hint before decoding links + vectors

Calling memory::madvise::will_need_multiple_pages(&neighbors[start..end]) here is well-placed: it prefetches exactly the slice that includes the varint length, compressed links, padding, and vectors, and becomes a no-op for single-page regions or non-Unix builds. Minimal overhead due to the early page-span check and LazyLock caching.

@xzfc xzfc merged commit f2ce816 into dev Aug 25, 2025
16 checks passed
@xzfc xzfc deleted the will_need_multiple_pages branch August 25, 2025 09:22
Comment thread lib/common/memory/src/madvise.rs
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants