-
Notifications
You must be signed in to change notification settings - Fork 2.6k
Description
git_diff_find_similar can sometimes fail to detect identical files if:
- The file is untracked in one side of the tree, so the similarity measurement is not short-circuited by OID comparison.
- Not run with exact match only mode, since that forces OID comparison even for untracked files.
- The file contains very long lines (>4096 characters). These lines contains fairly random characters.
libgit2 version: 1.9.2 (1.9.2-1 from ArchLinux)
To reproduce:
- Place some large random files inside a new repository:
mkdir repo
cd repo
git init
for SIZE in 1024 2048 4096 8192 16384 32768; do tr -dc "A-Za-z0-9" </dev/urandom | head -c $SIZE > random.$SIZE.txt; done
git add .
git commit -m "Init"
- Move them
for F in *; do mv $F $F.moved; done
-
Run
git_diff_tree_to_workdirandgit_diff_find_similarin the repo, with flagsGIT_DIFF_INCLUDE_UNTRACKED | GIT_DIFF_RECURSE_UNTRACKED_DIRSandGIT_DIFF_FIND_RENAMES | GIT_DIFF_FIND_FOR_UNTRACKED.Here is a demonstrative snippet created by Gemini in the spoiler.
diff.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <git2.h>
void check_error(int error, const char *message) {
if (error < 0) {
const git_error *e = giterr_last();
fprintf(stderr, "Error %s: %s\n", message, (e) ? e->message : "Unknown Error");
exit(1);
}
}
const char* status_to_string(git_delta_t status) {
switch (status) {
case GIT_DELTA_UNMODIFIED: return "U";
case GIT_DELTA_ADDED: return "A";
case GIT_DELTA_DELETED: return "D";
case GIT_DELTA_MODIFIED: return "M";
case GIT_DELTA_RENAMED: return "R";
case GIT_DELTA_COPIED: return "C";
case GIT_DELTA_IGNORED: return "I";
case GIT_DELTA_UNTRACKED: return "?";
case GIT_DELTA_TYPECHANGE: return "T";
default: return "X";
}
}
int main(int argc, char *argv[]) {
if (argc != 3) {
fprintf(stderr, "Usage: %s <repo_path> <similarity_threshold>\n", argv[0]);
fprintf(stderr, " <repo_path>: Path to the git repository\n");
fprintf(stderr, " <similarity_threshold>: A number 0-100, or 'exact'\n");
return 1;
}
const char *repo_path = argv[1];
const char *sim_arg = argv[2];
int use_exact = 0;
long threshold_val = 50; // Default
// Parse similarity argument
if (strcmp(sim_arg, "exact") == 0) {
use_exact = 1;
} else {
char *endptr;
threshold_val = strtol(sim_arg, &endptr, 10);
if (*endptr != '\0' || threshold_val < 0 || threshold_val > 100) {
fprintf(stderr, "Error: Second argument must be 'exact' or a number between 0 and 100.\n");
return 1;
}
}
git_libgit2_init();
git_repository *repo = NULL;
git_object *head_obj = NULL;
git_tree *head_tree = NULL;
git_diff *diff = NULL;
int error;
// 1. Open the repository from CLI arg
error = git_repository_open(&repo, repo_path);
check_error(error, "opening repository");
// 2. Resolve HEAD to a tree
error = git_revparse_single(&head_obj, repo, "HEAD^{tree}");
check_error(error, "resolving HEAD tree");
head_tree = (git_tree *)head_obj;
// 3. Configure Diff Options (Include Untracked)
git_diff_options opts = GIT_DIFF_OPTIONS_INIT;
opts.flags = GIT_DIFF_INCLUDE_UNTRACKED | GIT_DIFF_RECURSE_UNTRACKED_DIRS;
// 4. Diff Tree (HEAD) -> Working Directory
error = git_diff_tree_to_workdir(&diff, repo, head_tree, &opts);
check_error(error, "generating diff");
// 5. Configure Rename Detection based on CLI arg
git_diff_find_options find_opts = GIT_DIFF_FIND_OPTIONS_INIT;
find_opts.flags = GIT_DIFF_FIND_RENAMES | GIT_DIFF_FIND_FOR_UNTRACKED;
if (use_exact) {
find_opts.flags |= GIT_DIFF_FIND_EXACT_MATCH_ONLY;
printf("Config: Exact Match Only\n");
} else {
find_opts.rename_threshold = (uint16_t)threshold_val;
printf("Config: Fuzzy Match (Threshold: %ld%%)\n", threshold_val);
}
error = git_diff_find_similar(diff, &find_opts);
check_error(error, "finding renames");
// 6. Iterate and Print Deltas
size_t num_deltas = git_diff_num_deltas(diff);
printf("Found %zu deltas in '%s':\n", num_deltas, repo_path);
printf("Stat Path\n");
printf("---- ----\n");
for (size_t i = 0; i < num_deltas; ++i) {
const git_diff_delta *delta = git_diff_get_delta(diff, i);
if (delta->status == GIT_DELTA_RENAMED) {
printf("[%s] %s -> %s\n",
status_to_string(delta->status),
delta->old_file.path,
delta->new_file.path);
} else {
printf("[%s] %s\n",
status_to_string(delta->status),
delta->new_file.path);
}
}
// Cleanup
git_diff_free(diff);
git_tree_free(head_tree);
git_repository_free(repo);
git_libgit2_shutdown();
return 0;
}cd ..
gcc -o diff diff.c -lgit2
./diff ./repo 50
This should output (with some randomness):
Config: Fuzzy Match (Threshold: 50%)
Found 9 deltas in '.':
Stat Path
---- ----
[R] random.1024.txt -> random.1024.txt.moved
[D] random.16384.txt
[?] random.16384.txt.moved
[R] random.2048.txt -> random.2048.txt.moved
[D] random.32768.txt
[?] random.32768.txt.moved
[R] random.4096.txt -> random.4096.txt.moved
[D] random.8192.txt
[?] random.8192.txt.moved
Running with exact match (GIT_DIFF_FIND_EXACT_MATCH_ONLY):
./diff ./repo exact
Config: Exact Match Only
Found 6 deltas in '.':
Stat Path
---- ----
[R] random.1024.txt -> random.1024.txt.moved
[R] random.16384.txt -> random.16384.txt.moved
[R] random.2048.txt -> random.2048.txt.moved
[R] random.32768.txt -> random.32768.txt.moved
[R] random.4096.txt -> random.4096.txt.moved
[R] random.8192.txt -> random.8192.txt.moved
Additional notes
It seems that for shorter random files (<= 4096 in the experiment above), the measured similarity is 100. Setting threshold to 100 would still produce a match. For longer files, by bisecting the threshold, we can determine the measured similarity:
- 8192: between 49 and 50
- 16384: between 30 and 31
- 32768: between 27 and 28
I assume the similarity measurement is deterministic? So it is fairly surprising that identical files can be measured as different files.
If the long lines only contain a single character, the match can still fail, but with much higher measured similarity w.r.t. its line length.
➜ test git:(master) ✗ printf 'A%.0s' {1..65536} > A.65536.txt
➜ test git:(master) ✗ git add A.65536.txt && git commit -m "A"
[master 90ff115] A
1 file changed, 1 insertion(+)
create mode 100644 A.65536.txt
➜ test git:(master) ✗ mv A.65536.txt A.65536.txt.moved
➜ test git:(master) ✗ ../diff/diff . 100
Config: Fuzzy Match (Threshold: 100%)
Found 11 deltas in '.':
Stat Path
---- ----
[D] A.65536.txt
[?] A.65536.txt.moved
... (Some other files)
But it will match with threshold 90.