fix: doc, requirements, and data cleaning script · bigcode-project/selfcodealign@2ca7037

Commit 2ca7037

fix: doc, requirements, and data cleaning script
1 parent 9427b74 commit 2ca7037

File tree: README.md · requirements.txt · src/star_align/clean_data.py

3 files changed: +155 -27 lines changed

README.md

Lines changed: 61 additions & 12 deletions

@@ -1,4 +1,4 @@
-# StarCoder2-Instruct
+# StarCoder2-Instruct: Self-Aligned, Transparent, and Fully Permissive
 
 > [!WARNING]
 > This documentation is still WIP.
@@ -7,28 +7,77 @@
 
 We used VLLM's [OpenAI compatible server](https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html) for data generation. So, before running the following commands, make sure the VLLM server is running, and the associated `openai` environment variables are set.
 
-**Snippet to concept generation:**
+For example, you can start a VLLM server with `docker`:
 
 ```shell
-python src/star_align/self_ossinstruct.py --instruct_mode "S->C" --seed_data_files /path/to/seeds.jsonl --max_new_data 50000 --tag $TAG --temperature 0.7 --seed_code_start_index 0 --model bigcode/starcoder2-15b --num_fewshots 8 --num_batched_requests 128 --num_sample_per_request 1
+docker run --gpus '"device=0"' \
+    -v $HF_HOME:/root/.cache/huggingface \
+    -p 10000:8000 \
+    --ipc=host \
+    vllm/vllm-openai:v0.3.3 \
+    --model bigcode/starcoder2-15b \
+    --tensor-parallel-size 1 --dtype bfloat16
 ```
 
-**Concept to instruction generation:**
+And then set the environment variables as follows:
 
 ```shell
-python src/star_align/self_ossinstruct.py --instruct_mode "C->I" --seed_data_files /path/to/seeds.jsonl --max_new_data 50000 --tag $TAG --temperature 0.7 --seed_code_start_index 0 --model bigcode/starcoder2-15b -–num_fewshots 8 --num_sample_per_request 1 --num_batched_request 128
+export OPENAI_API_KEY="EMPTY"
+export OPENAI_BASE_URL="http://localhost:10000/v1/"
 ```
 
-**Instruction to response + self-validation code generation:**
+### Snippet to concept
 
 ```shell
-python src/star_align/self_ossinstruct.py --instruct_mode "I->R" --seed_data_files path/to/instructions.jsonl --max_new_data 50000 --tag $TAG --seed_code_start_index 0 --model bigcode/starcoder2-15b --num_fewshots 1 --num_batched_request 16 --num_sample_per_request 10 --temperature 0.7
+python src/star_align/self_ossinstruct.py \
+    --instruct_mode "S->C" \
+    --seed_data_files /path/to/seeds.jsonl \
+    --max_new_data 50000 \
+    --tag concept_gen \
+    --temperature 0.7 \
+    --seed_code_start_index 0 \
+    --model bigcode/starcoder2-15b \
+    --num_fewshots 8 \
+    --num_batched_requests 32 \
+    --num_sample_per_request 1
 ```
 
-**Execution filtering:**
+### Concept to instruction
+
+```shell
+python src/star_align/self_ossinstruct.py \
+    --instruct_mode "C->I" \
+    --seed_data_files /path/to/concepts.jsonl \
+    --max_new_data 50000 \
+    --tag instruction_gen \
+    --temperature 0.7 \
+    --seed_code_start_index 0 \
+    --model bigcode/starcoder2-15b \
+    --num_fewshots 8 \
+    --num_sample_per_request 1 \
+    --num_batched_request 32
+```
+
+### Instruction to response w/ self-validation code
+
+```shell
+python src/star_align/self_ossinstruct.py \
+    --instruct_mode "I->R" \
+    --seed_data_files path/to/instructions.jsonl \
+    --max_new_data 50000 \
+    --tag response_gen \
+    --seed_code_start_index 0 \
+    --model bigcode/starcoder2-15b \
+    --num_fewshots 1 \
+    --num_batched_request 8 \
+    --num_sample_per_request 10 \
+    --temperature 0.7
+```
+
+### Execution filter
 
 > [!WARNING]
-> Though we implemented reliability guards, it is highly recommended to run execution in a sandbox environment.
+> Though we implemented reliability guards, it is highly recommended to run execution in a sandbox environment. The command below doesn't provide sandboxing by default.
 
 ```shell
 python src/star_align/execution_filter.py --response_path /path/to/response.jsonl --result_path /path/to/filtered.jsonl
@@ -37,10 +86,10 @@ python src/star_align/execution_filter.py --response_path /path/to/response.jsonl
 # Note that filtered.jsonl may contain multiple passing samples for the same instruction which needs further selection.
 ```
 
-**Data sanitization and selection:**
+### Data sanitization and selection
 
 ```shell
-RAW=1 python src/star_align/tools/sanitize_data.py /path/to/filtered.jsonl /path/to/sanitized.jsonl
+RAW=1 python src/star_align/sanitize_data.py /path/to/filtered.jsonl /path/to/sanitized.jsonl
 python src/star_align/clean_data.py --data_files /path/to/sanitized.jsonl --output_file /path/to/sanitized.jsonl --diversify_func_names
-SMART=1 python src/star_align/tools/sanitize_data.py /path/to/sanitized.jsonl /path/to/sanitized.jsonl
+SMART=1 python src/star_align/sanitize_data.py /path/to/sanitized.jsonl /path/to/sanitized.jsonl
 ```
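
Before launching a long generation run against this setup, a quick connectivity probe can save time. The sketch below is not part of the commit; it only assumes the `openai>=1.3.7` client already pinned in requirements.txt and the `OPENAI_API_KEY`/`OPENAI_BASE_URL` values exported above:

```python
# Minimal sanity check for the local VLLM OpenAI-compatible endpoint.
# Assumes OPENAI_API_KEY and OPENAI_BASE_URL are exported as in the README.
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY / OPENAI_BASE_URL from the environment

completion = client.completions.create(
    model="bigcode/starcoder2-15b",  # must match the model the server is running
    prompt="def fibonacci(n):",
    max_tokens=64,
    temperature=0.7,
)
print(completion.choices[0].text)
```

If this returns a completion, the same endpoint is what `self_ossinstruct.py` will use for the S->C, C->I, and I->R stages.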

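On the sandboxing warning for the execution filter: the repository's own reliability guards are not shown in this commit, but the kind of per-process limit the warning alludes to can be sketched as follows (an illustrative assumption, Unix-only, not the project's actual implementation):

```python
# Sketch of a crude per-sample guard: CPU-time and memory caps plus a timeout.
# Illustrative only; a real sandbox (container, jail, VM) is still recommended.
import resource
import subprocess


def run_with_limits(code_path: str, timeout_s: int = 10) -> bool:
    def set_limits() -> None:
        # Cap CPU seconds and address space (1 GiB) for the child process.
        resource.setrlimit(resource.RLIMIT_CPU, (timeout_s, timeout_s))
        resource.setrlimit(resource.RLIMIT_AS, (2**30, 2**30))

    try:
        proc = subprocess.run(
            ["python", code_path],
            preexec_fn=set_limits,
            timeout=timeout_s,
            capture_output=True,
        )
        return proc.returncode == 0
    except subprocess.TimeoutExpired:
        return False
```
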
requirements.txt

Lines changed: 1 addition & 0 deletions

@@ -4,4 +4,5 @@ openai>=1.3.7
 tenacity~=8.2.3
 tiktoken~=0.5.1
 accelerate==0.27.2
+datasets~=2.17.1
 git+https://github.com/evalplus/evalplus.git@25e195e024b614f2671ad9ac5b8fdcd9b95a2b24#egg=evalplus

src/star_align/clean_data.py

Lines changed: 93 additions & 15 deletions

@@ -1,19 +1,97 @@
-from star_align.utils import read_jsonl, write_jsonl
-import sys
-import re
+import ast
+import random
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import cast
 
-dataset = read_jsonl(sys.argv[1])
+from tqdm.auto import tqdm
+from transformers import HfArgumentParser
 
-def contains_chinese(s):
-    return bool(re.search(r'[\u4e00-\u9fff]', s))
+from star_align.utils import find_code_blocks, read_jsonl, write_jsonl
 
-chosen = []
-rejected = []
-for example in dataset:
-    if "code snippet" in example["instruction"] or contains_chinese(example["instruction"] + example["response"]):
-        rejected.append(example)
-    else:
-        chosen.append(example)
 
-print(f"Removed {len(dataset) - len(chosen)} examples")
-write_jsonl(sys.argv[2], chosen)
+@dataclass(frozen=True)
+class Args:
+    data_files: list[str]
+    output_file: str
+    diversify_func_names: bool = field(default=False)
+
+
+def extract_and_concat_function_names(python_content):
+    """
+    Extracts all function names from a given Python content string and concatenates them into a single string.
+
+    Parameters:
+    - python_content: A string containing the Python code to analyze.
+
+    Returns:
+    - A string containing all function names defined in the content, concatenated.
+    """
+    tree = ast.parse(python_content)
+    function_names = []
+
+    # Define a node visitor that adds the name of each function definition it visits
+    class FunctionDefVisitor(ast.NodeVisitor):
+        def visit_FunctionDef(self, node):
+            function_names.append(node.name)
+            # Process the subtree for this node
+            self.generic_visit(node)
+
+        def visit_AsyncFunctionDef(self, node):
+            function_names.append(node.name)
+            self.generic_visit(node)
+
+    # Create a node visitor and walk through the AST
+    visitor = FunctionDefVisitor()
+    visitor.visit(tree)
+
+    # Concatenate all function names into a single string
+    return " ".join(function_names)
+
+
+def main():
+    args = cast(Args, HfArgumentParser(Args).parse_args_into_dataclasses()[0])
+    raw_data: list[dict] = []
+    for data_file in args.data_files:
+        data = read_jsonl(Path(data_file))
+        # language = data_file.split("-")[1]
+        # assert language in ALL_LANGS, f"Unknown language {language}"
+        # raw_data.extend(dict(lang=language, **d) for d in data)
+        raw_data.extend(data)
+    # common keys for all d in data
+    common_keys = set.intersection(*(set(d.keys()) for d in raw_data))
+    raw_data = [{k: d[k] for k in common_keys} for d in raw_data]
+    print(f"Common keys: {common_keys}")
+    # counter = defaultdict[str, int](int)
+
+    def mk_key(instruction: str) -> str:
+        return "".join(instruction.split())
+
+    random.seed(0)
+    random.shuffle(raw_data)
+
+    seen_keys = set[str]()
+    new_data = list[dict]()
+    for d in tqdm(raw_data):
+        key_i, key_r = mk_key(d["instruction"]), mk_key(d["response"])
+        if key_i in seen_keys or key_r in seen_keys:
+            continue
+        if args.diversify_func_names:
+            code_block = find_code_blocks(d["response"])[0]
+            try:
+                fn_names = extract_and_concat_function_names(code_block)
+            except SyntaxError:
+                continue
+            if fn_names in seen_keys:
+                continue
+            seen_keys.add(fn_names)
+        new_data.append(d)
+        seen_keys.add(key_i)
+        seen_keys.add(key_r)
+
+    print(f"Chose {len(new_data)} out of {len(raw_data)}")
+    write_jsonl(Path(args.output_file), new_data)
+
+
+if __name__ == "__main__":
+    main()
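
To make the `--diversify_func_names` behavior concrete, here is a small illustrative check (not part of the commit; it assumes the package is installed so that `star_align.clean_data` is importable). Two responses whose code defines the same function names collapse onto one dedup key, so only the first survives:

```python
# Illustrative check of the dedup key used by --diversify_func_names.
from star_align.clean_data import extract_and_concat_function_names

snippet_a = """
def parse(line):
    return line.split(",")

def main():
    print(parse("a,b"))
"""

snippet_b = """
def parse(path):
    ...

def main():
    ...
"""

# Same function names in the same order -> same key -> second record dropped.
assert extract_and_concat_function_names(snippet_a) == "parse main"
assert extract_and_concat_function_names(snippet_a) == extract_and_concat_function_names(snippet_b)
```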
