From b9c329cbdb35aa5fb7878e615500c9d41bfa356a Mon Sep 17 00:00:00 2001
From: Tim Adamson
Date: Thu, 12 Mar 2026 18:39:00 +1100
Subject: [PATCH] perf: cache .gitignore content to optimize fsmonitor-backed status

Introduce in-memory caching of .gitignore file contents within the
untracked cache to improve performance when fsmonitor is active.

When fsmonitor confirms a directory is unchanged, we can safely reuse
the cached .gitignore content and OID without re-reading and re-hashing
the file from disk. This eliminates expensive prep_exclude() calls that
would otherwise open, read, and hash every .gitignore file along the
path hierarchy. For repositories with many .gitignore files, this
provides significant performance improvements during status operations.

The optimization works by:

- Storing raw .gitignore content in untracked_cache_dir (memory only)
- Skipping prep_exclude() entirely when fsmonitor validates the dir
- Reusing cached content when rebuilding exclude stacks for
  invalidated child directories
- Recording in the untracked cache's gitignore_changed flag whether
  fsmonitor reported any .gitignore file as changed. Correctness itself
  relies on fsmonitor invalidating the directory that contains a
  changed .gitignore; the flag is only recorded by this patch and is
  not yet consulted by valid_cached_dir().

The exclude patterns are still loaded lazily when actually needed for
files in invalidated subdirectories.

Add trace2 metrics to measure the effectiveness: gitignore-skipped
counts directories where prep_exclude() was avoided, and
gitignore-cached counts reuses of cached content.

Co-authored-by: Forge --- dir.c | 129 ++++++++++++++++++++++++++++-- dir.h | 17 ++++ fsmonitor.c | 31 +++++++- t/t7519-status-fsmonitor.sh | 152 ++++++++++++++++++++++++++++++++++++ 4 files changed, 322 insertions(+), 7 deletions(-) diff --git a/dir.c b/dir.c index 026d8516a912af..37d3ba9aee3577 100644 --- a/dir.c +++ b/dir.c @@ -28,6 +28,7 @@ #include "ewah/ewok.h" #include "fsmonitor-ll.h" #include "read-cache-ll.h" +#include "trace.h" #include "setup.h" #include "sparse-index.h" #include "strbuf.h" @@ -1098,6 +1099,12 @@ static void do_invalidate_gitignore(struct untracked_cache_dir *dir) { int i; dir->valid = 0; + /* + * Clear the cached .gitignore content since the file may have + * changed. It will be re-read from disk on the next access. + */ + FREE_AND_NULL(dir->exclude_content); + dir->exclude_content_len = 0; for (size_t i = 0; i < dir->untracked_nr; i++) free(dir->untracked[i]); dir->untracked_nr = 0; @@ -1127,6 +1134,8 @@ static void invalidate_directory(struct untracked_cache *uc, uc->dir_invalidated++; dir->valid = 0; + FREE_AND_NULL(dir->exclude_content); + dir->exclude_content_len = 0; for (size_t i = 0; i < dir->untracked_nr; i++) free(dir->untracked[i]); dir->untracked_nr = 0; @@ -1145,10 +1154,15 @@ static void invalidate_directory(struct untracked_cache *uc, * If "oid_stat" is not NULL, compute oid of the exclude file and fill * stat data from disk (only valid if add_patterns returns zero). If * oid_stat.valid is non-zero, "oid_stat" must contain good value as input. + * + * If "content_out" and "content_len_out" are not NULL, store a copy of + * the raw file content for later caching (e.g. in untracked_cache_dir). + * The caller is responsible for freeing *content_out. 
*/ static int add_patterns(const char *fname, const char *base, int baselen, struct pattern_list *pl, struct index_state *istate, - unsigned flags, struct oid_stat *oid_stat) + unsigned flags, struct oid_stat *oid_stat, + char **content_out, size_t *content_len_out) { struct stat st; int r; @@ -1218,6 +1232,16 @@ static int add_patterns(const char *fname, const char *base, int baselen, return -1; } + /* + * If the caller wants to cache the raw content (for + * fsmonitor-backed reuse), store a copy before parsing + * modifies the buffer in-place. + */ + if (content_out && content_len_out) { + *content_out = xmemdupz(buf, size); + *content_len_out = size; + } + add_patterns_from_buffer(buf, size, base, baselen, pl); free(buf); return 0; @@ -1258,7 +1282,8 @@ int add_patterns_from_file_to_list(const char *fname, const char *base, struct index_state *istate, unsigned flags) { - return add_patterns(fname, base, baselen, pl, istate, flags, NULL); + return add_patterns(fname, base, baselen, pl, istate, flags, NULL, + NULL, NULL); } int add_patterns_from_blob_to_list( @@ -1315,7 +1340,7 @@ static void add_patterns_from_file_1(struct dir_struct *dir, const char *fname, if (!dir->untracked) dir->internal.unmanaged_exclude_files++; pl = add_pattern_list(dir, EXC_FILE, fname); - if (add_patterns(fname, "", 0, pl, NULL, 0, oid_stat) < 0) + if (add_patterns(fname, "", 0, pl, NULL, 0, oid_stat, NULL, NULL) < 0) die(_("cannot use %s as an exclude file"), fname); } @@ -1770,11 +1795,68 @@ static void prep_exclude(struct dir_struct *dir, strbuf_addbuf(&sb, &dir->internal.basebuf); strbuf_addstr(&sb, dir->exclude_per_dir); pl->src = strbuf_detach(&sb, NULL); - add_patterns(pl->src, pl->src, stk->baselen, pl, istate, - PATTERN_NOFOLLOW, - untracked ? &oid_stat : NULL); + + /* + * When fsmonitor is active and this directory is + * unchanged, check for cached .gitignore content + * from a previous load. 
This avoids re-reading the + * file from disk when we're building the exclude + * stack for an invalidated child directory. + */ + if (dir->untracked && + dir->untracked->use_fsmonitor && + untracked && untracked->valid && + untracked->exclude_content) { + char *buf_copy; + /* + * add_patterns_from_buffer() modifies the + * buffer in-place, so we must duplicate it. + */ + buf_copy = xmemdupz(untracked->exclude_content, + untracked->exclude_content_len); + add_patterns_from_buffer(buf_copy, + untracked->exclude_content_len, + pl->src, stk->baselen, pl); + free(buf_copy); + /* + * Trust the cached OID since fsmonitor + * guarantees the file hasn't changed. + */ + oidcpy(&oid_stat.oid, &untracked->exclude_oid); + dir->untracked->gitignore_cached++; + trace_printf_key(&trace_fsmonitor, + "prep_exclude: used cached " + ".gitignore for '%s'", + pl->src); + } else { + /* + * Read the .gitignore from disk. If the + * untracked cache is active, also cache the + * content for potential reuse when fsmonitor + * confirms the file hasn't changed. + */ + char *cached_content = NULL; + size_t cached_len = 0; + add_patterns(pl->src, pl->src, stk->baselen, + pl, istate, PATTERN_NOFOLLOW, + untracked ? &oid_stat : NULL, + untracked ? &cached_content : NULL, + untracked ? &cached_len : NULL); + if (untracked && cached_content) { + free(untracked->exclude_content); + untracked->exclude_content = cached_content; + untracked->exclude_content_len = cached_len; + } + } } /* + * With the fsmonitor optimization in valid_cached_dir(), the + * NEEDSWORK below is partially addressed: when the cache is + * fully valid (confirmed by fsmonitor), prep_exclude() is not + * called at all from valid_cached_dir(). It is only called + * here when building the exclude stack for an invalidated + * child directory, where the patterns ARE needed. 
+ * * NEEDSWORK: when untracked cache is enabled, prep_exclude() * will first be called in valid_cached_dir() then maybe many * times more in last_matching_pattern(). When the cache is @@ -2550,6 +2632,34 @@ static int valid_cached_dir(struct dir_struct *dir, if (untracked->check_only != !!check_only) return 0; + /* + * When fsmonitor is active and confirms this directory is + * unchanged, we can trust the cached exclude_oid without + * re-reading and re-hashing the .gitignore file from disk. + * The fsmonitor guarantees that if anything in this directory + * changed (including the .gitignore file), the directory would + * have been invalidated via untracked_cache_invalidate_trimmed_path(). + * + * This avoids the expensive prep_exclude() call which would + * open, read, and hash every .gitignore file along the path, + * only to confirm the OID hasn't changed. For repositories + * with many .gitignore files, this is a significant performance + * improvement. + * + * The exclude patterns will still be loaded lazily by + * prep_exclude() if they are actually needed later (e.g. when + * last_matching_pattern() is called for files in invalidated + * child directories). + */ + if (dir->untracked->use_fsmonitor && untracked->valid) { + dir->untracked->gitignore_skipped++; + trace_printf_key(&trace_fsmonitor, + "valid_cached_dir: skip prep_exclude for " + "fsmonitor-valid dir '%s'", + path->buf); + return 1; + } + /* * prep_exclude will be called eventually on this directory, * but it's called much later in last_matching_pattern(). 
We @@ -3130,6 +3240,12 @@ static void emit_traversal_statistics(struct dir_struct *dir, dir->untracked->dir_invalidated); trace2_data_intmax("read_directory", repo, "opendir", dir->untracked->dir_opened); + trace2_data_intmax("read_directory", repo, + "gitignore-skipped", + dir->untracked->gitignore_skipped); + trace2_data_intmax("read_directory", repo, + "gitignore-cached", + dir->untracked->gitignore_cached); } int read_directory(struct dir_struct *dir, struct index_state *istate, @@ -3732,6 +3848,7 @@ static void free_untracked(struct untracked_cache_dir *ucd) free(ucd->untracked[i]); free(ucd->untracked); free(ucd->dirs); + free(ucd->exclude_content); free(ucd); } diff --git a/dir.h b/dir.h index 20d4a078d61ef8..6cd33383fde5ce 100644 --- a/dir.h +++ b/dir.h @@ -184,6 +184,14 @@ struct untracked_cache_dir { unsigned int recurse : 1; /* null object ID means this directory does not have .gitignore */ struct object_id exclude_oid; + /* + * In-memory cache of .gitignore file content for fsmonitor + * optimization. When fsmonitor confirms a directory is unchanged, + * we can reuse this cached content instead of re-reading from disk. + * This field is NOT serialized to the index extension. + */ + char *exclude_content; + size_t exclude_content_len; char name[FLEX_ARRAY]; }; @@ -204,8 +212,17 @@ struct untracked_cache { int gitignore_invalidated; int dir_invalidated; int dir_opened; + int gitignore_skipped; /* prep_exclude() skipped via fsmonitor */ + int gitignore_cached; /* prep_exclude() used cached content */ /* fsmonitor invalidation data */ unsigned int use_fsmonitor : 1; + /* + * Set during refresh_fsmonitor() if any .gitignore file was + * reported as changed. This enables targeted invalidation: + * when no .gitignore files changed, all cached exclude results + * can be fully trusted without re-reading any .gitignore files. 
+ */ + unsigned int gitignore_changed : 1; }; /** diff --git a/fsmonitor.c b/fsmonitor.c index d07dc18967ae33..aa3ecbe93f95b5 100644 --- a/fsmonitor.c +++ b/fsmonitor.c @@ -442,6 +442,25 @@ static void fsmonitor_refresh_callback(struct index_state *istate, char *name) "fsmonitor_refresh_callback '%s' (pos %d)", name, pos); + /* + * Detect changes to .gitignore files so we can set the + * gitignore_changed flag on the untracked cache. This allows + * valid_cached_dir() to know whether any ignore rules may + * have changed, enabling more aggressive caching when no + * .gitignore files were modified. + */ + if (istate->untracked && istate->untracked->exclude_per_dir) { + const char *base = strrchr(name, '/'); + const char *filename = base ? base + 1 : name; + if (!strcmp(filename, istate->untracked->exclude_per_dir)) { + istate->untracked->gitignore_changed = 1; + trace_printf_key(&trace_fsmonitor, + "fsmonitor_refresh_callback: " + ".gitignore changed '%s'", + name); + } + } + if (name[len - 1] == '/') nr_in_cone = handle_path_with_trailing_slash(istate, name, pos); else @@ -657,6 +676,14 @@ void refresh_fsmonitor(struct index_state *istate) */ trace2_region_enter("fsmonitor", "apply_results", istate->repo); + /* + * Reset the gitignore_changed flag before processing results. + * It will be set by fsmonitor_refresh_callback() if any + * .gitignore file appears in the changed path list. + */ + if (istate->untracked) + istate->untracked->gitignore_changed = 0; + if (query_success && !is_trivial) { /* * Mark all pathnames returned by the monitor as dirty. 
@@ -713,8 +740,10 @@ void refresh_fsmonitor(struct index_state *istate) if (is_cache_changed) istate->cache_changed |= FSMONITOR_CHANGED; - if (istate->untracked) + if (istate->untracked) { istate->untracked->use_fsmonitor = 0; + istate->untracked->gitignore_changed = 1; + } } trace2_region_leave("fsmonitor", "apply_results", istate->repo); diff --git a/t/t7519-status-fsmonitor.sh b/t/t7519-status-fsmonitor.sh index 7ee69ecdd4aa2c..4eda8099c0e7d3 100755 --- a/t/t7519-status-fsmonitor.sh +++ b/t/t7519-status-fsmonitor.sh @@ -477,4 +477,156 @@ test_expect_success 'status succeeds with sparse index' ' ) ' +test_expect_success UNTRACKED_CACHE 'fsmonitor skips .gitignore reading for valid dirs' ' + test_create_repo skip-gitignore && + ( + cd skip-gitignore && + + # Create a directory structure with multiple .gitignore files + mkdir -p dir1 dir2 dir3 && + : >tracked && + : >dir1/tracked && + : >dir2/tracked && + : >dir3/tracked && + echo "*.log" >.gitignore && + echo "*.tmp" >dir1/.gitignore && + echo "*.bak" >dir2/.gitignore && + echo "*.old" >dir3/.gitignore && + test-tool chmtime =-60 tracked dir1/tracked dir2/tracked dir3/tracked && + test-tool chmtime =-60 .gitignore dir1/.gitignore dir2/.gitignore dir3/.gitignore && + test-tool chmtime =-60 dir1 dir2 dir3 . 
&& + git add tracked dir1/tracked dir2/tracked dir3/tracked && + git add .gitignore dir1/.gitignore dir2/.gitignore dir3/.gitignore && + git commit -m "initial" && + + # Install a no-change fsmonitor hook + test_hook --setup --clobber fsmonitor-test <<-\EOF && + printf "last_update_token\0" + EOF + git config core.fsmonitor .git/hooks/fsmonitor-test && + git update-index --untracked-cache && + git update-index --fsmonitor && + + # First status populates the cache + git status && + + # Second status should use the cache + GIT_TRACE2_PERF="$TRASH_DIRECTORY/trace-skip" \ + git status && + + # Verify the optimization is working: gitignore-skipped should be + # non-zero (directories whose prep_exclude was skipped thanks to + # fsmonitor confirming they are unchanged) + grep "gitignore-skipped" "$TRASH_DIRECTORY/trace-skip" >../trace-skip-lines && + # Check that the value after the colon is > 0 + grep "gitignore-skipped:[1-9]" ../trace-skip-lines + ) +' + +test_expect_success UNTRACKED_CACHE 'fsmonitor correctly invalidates on .gitignore change' ' + test_create_repo gitignore-invalidate && + ( + cd gitignore-invalidate && + + # Set up repo with .gitignore + mkdir -p dir1 && + : >tracked && + : >dir1/tracked && + echo "*.log" >.gitignore && + test-tool chmtime =-60 tracked dir1/tracked .gitignore && + test-tool chmtime =-60 dir1 . 
&& + git add tracked dir1/tracked .gitignore && + git commit -m "initial" && + + # Install fsmonitor hook that reports no changes initially + test_hook --setup --clobber fsmonitor-test <<-\EOF && + printf "last_update_token\0" + EOF + git config core.fsmonitor .git/hooks/fsmonitor-test && + git update-index --untracked-cache && + git update-index --fsmonitor && + + # Populate the cache + git status && + git status && + + # Create test files - one should be ignored, one should not + : >dir1/test.log && + : >dir1/test.txt && + + # Now install a hook that reports the .gitignore changed + # plus the new files + test_hook --clobber fsmonitor-test <<-\EOF && + printf "last_update_token\0" + printf ".gitignore\0" + printf "dir1/test.log\0" + printf "dir1/test.txt\0" + printf "dir1\0" + EOF + + # Status should correctly apply ignore rules even though + # the cache was invalidated + git status --porcelain >../actual-invalidate && + echo "?? dir1/test.txt" >../expect-invalidate && + test_cmp ../expect-invalidate ../actual-invalidate + ) +' + +test_expect_success UNTRACKED_CACHE 'fsmonitor status correct with many .gitignore files' ' + test_create_repo many-gitignores && + ( + cd many-gitignores && + + # Create a deeper directory tree with .gitignore at each level + mkdir -p a/b/c/d && + : >tracked && + : >a/tracked && + : >a/b/tracked && + : >a/b/c/tracked && + : >a/b/c/d/tracked && + echo "*.root-ignored" >.gitignore && + echo "*.a-ignored" >a/.gitignore && + echo "*.b-ignored" >a/b/.gitignore && + echo "*.c-ignored" >a/b/c/.gitignore && + echo "*.d-ignored" >a/b/c/d/.gitignore && + git add -A && + git commit -m "initial" && + + # Install no-change fsmonitor hook + test_hook --setup --clobber fsmonitor-test <<-\EOF && + printf "last_update_token\0" + EOF + git config core.fsmonitor .git/hooks/fsmonitor-test && + git update-index --untracked-cache && + git update-index --fsmonitor && + + # Populate and warm the cache + git status && + git status && + + # Add files at the 
deepest level - some ignored, some not + : >a/b/c/d/file.root-ignored && + : >a/b/c/d/file.d-ignored && + : >a/b/c/d/file.txt && + : >a/b/c/d/file.a-ignored && + + # Report only the deepest dir as changed + test_hook --clobber fsmonitor-test <<-\EOF && + printf "last_update_token\0" + printf "a/b/c/d/file.root-ignored\0" + printf "a/b/c/d/file.d-ignored\0" + printf "a/b/c/d/file.txt\0" + printf "a/b/c/d/file.a-ignored\0" + printf "a/b/c/d\0" + EOF + + # Status should correctly evaluate all parent .gitignore + # rules and only show the non-ignored file. + # Ignore rules cascade: root, a/, a/b/, a/b/c/, a/b/c/d/ + git status --porcelain >../actual-many && + echo "?? a/b/c/d/file.txt" >../expect-many && + test_cmp ../expect-many ../actual-many + ) +' + test_done