feat: option to ignore seed check when seed pool is large · bigcode-project/selfcodealign@04a872a · GitHub
[go: up one dir, main page]

Skip to content

Commit 04a872a

Browse files
committed
feat: option to ignore seed check when seed pool is large
1 parent 1d85858 commit 04a872a

File tree

2 files changed

+12
-9
lines changed

2 files changed

+12
-9
lines changed

src/star_align/collect_snippets.py

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -21,25 +21,25 @@ class Args:
2121
data_dirs: list[str]
2222
data_mix_weights: list[float]
2323

24-
max_seeds_to_collect: int = field(default=37340)
24+
max_seeds_to_collect: int = field(default=100000000)
2525
continue_from: str | None = field(default=None)
2626

2727
# Keep the following arguments unchanged for reproducibility
2828
seed: int = field(default=976)
2929

3030
min_lines: int = field(default=5)
31-
max_lines: int = field(default=25)
31+
max_lines: int = field(default=30)
3232
min_doc_lines: int = field(default=10)
33-
max_doc_lines: int = field(default=1000)
33+
max_doc_lines: int = field(default=5000)
3434
max_avg_chars_per_line: int = field(default=80)
3535
# max_fragments: int = field(default=3)
3636
chunk_size: int = field(default=1000)
3737
# A small value lets one document be used by multiple seeds
38-
content_chunk_lines: int = field(default=99999999999)
38+
content_chunk_lines: int = field(default=100)
3939

4040
dataset_name: str = field(default="bigcode/starcoderdata")
4141
data_files: list[str] | None = field(default=None)
42-
max_considered_data: int | None = field(default=200000)
42+
max_considered_data: int | None = field(default=500000000)
4343

4444
collect_function: bool = field(default=False)
4545
max_nodes_to_traverse: int = field(default=20000)
@@ -576,7 +576,7 @@ def main():
576576
def get_seed_text(seed: str) -> str:
577577
return "".join(seed.split())
578578

579-
pbar = tqdm(total=args.max_seeds_to_collect)
579+
pbar = tqdm(total=min(args.max_seeds_to_collect, len(dataset)))
580580
for example in dataset:
581581
if n_success >= args.max_seeds_to_collect:
582582
break

src/star_align/self_ossinstruct.py

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -411,9 +411,12 @@ async def main():
411411
num_proc=star_align.utils.N_CORES,
412412
)
413413
id_key = "seed"
414-
assert len(set(d[id_key] for d in raw_dataset)) == len(
415-
raw_dataset
416-
), "Duplicate seeds appear in the dataset"
414+
if os.getenv("IGNORE_SEED_CHECK") is None:
415+
assert len(set(d[id_key] for d in raw_dataset)) == len(
416+
raw_dataset
417+
), "Duplicate seeds appear in the dataset"
418+
else:
419+
print("[Warning] Ignoring seed check")
417420

418421
# Every run should produce the same data as long as the default params are not changed
419422
start_index = args.seed_code_start_index

0 commit comments

Comments
 (0)
0