@@ -21,25 +21,25 @@ class Args:
     data_dirs: list[str]
     data_mix_weights: list[float]

-    max_seeds_to_collect: int = field(default=37340)
+    max_seeds_to_collect: int = field(default=100000000)
     continue_from: str | None = field(default=None)

     # Keep the following arguments unchanged for reproducibility
     seed: int = field(default=976)

     min_lines: int = field(default=5)
-    max_lines: int = field(default=25)
+    max_lines: int = field(default=30)
     min_doc_lines: int = field(default=10)
-    max_doc_lines: int = field(default=1000)
+    max_doc_lines: int = field(default=5000)
     max_avg_chars_per_line: int = field(default=80)
     # max_fragments: int = field(default=3)
     chunk_size: int = field(default=1000)
     # A small value lets one document be used by multiple seeds
-    content_chunk_lines: int = field(default=99999999999)
+    content_chunk_lines: int = field(default=100)

     dataset_name: str = field(default="bigcode/starcoderdata")
     data_files: list[str] | None = field(default=None)
-    max_considered_data: int | None = field(default=200000)
+    max_considered_data: int | None = field(default=500000000)

     collect_function: bool = field(default=False)
     max_nodes_to_traverse: int = field(default=20000)
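
The comment above content_chunk_lines explains the new default of 100: rather than treating a whole file as a single candidate, the file is split into chunks of at most content_chunk_lines lines, so several seeds can come from one long document. A minimal sketch of that idea follows; chunk_document is a hypothetical helper for illustration and is not part of this diff.

# Minimal sketch (hypothetical helper, not taken from the repository):
# split a document into chunks of at most `content_chunk_lines` lines
# so that multiple seeds can be drawn from a single long file.
def chunk_document(text: str, content_chunk_lines: int = 100) -> list[str]:
    lines = text.splitlines()
    return [
        "\n".join(lines[i : i + content_chunk_lines])
        for i in range(0, len(lines), content_chunk_lines)
    ]

# With the new default of 100, a 1,000-line file yields 10 candidate chunks
# instead of 1, which is what "lets one document be used by multiple seeds".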
@@ -576,7 +576,7 @@ def main():
     def get_seed_text(seed: str) -> str:
         return "".join(seed.split())

-    pbar = tqdm(total=args.max_seeds_to_collect)
+    pbar = tqdm(total=min(args.max_seeds_to_collect, len(dataset)))
    for example in dataset:
         if n_success >= args.max_seeds_to_collect:
             break
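
The second hunk caps the progress bar's total at the dataset size, so the bar has a meaningful upper bound now that max_seeds_to_collect defaults to a very large number. A minimal sketch of the surrounding loop is below, assuming dataset supports len() and that n_success counts accepted seeds; accept_example and the "content" field are stand-ins for the real filtering logic, which is not shown in this diff.

# Minimal sketch of the collection loop (simplified from the diff context).
from tqdm import tqdm

def collect_seeds(dataset, max_seeds_to_collect: int) -> list[str]:
    seeds: list[str] = []
    n_success = 0
    # Cap the bar at the dataset size so it stays accurate even when
    # max_seeds_to_collect far exceeds the number of available examples.
    pbar = tqdm(total=min(max_seeds_to_collect, len(dataset)))
    for example in dataset:
        if n_success >= max_seeds_to_collect:
            break
        text = example.get("content", "")  # assumes rows expose a "content" field
        if text:  # stand-in for the real acceptance checks
            seeds.append(text)
            n_success += 1
            pbar.update(1)
    pbar.close()
    return seeds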