bigcode-project
diff --git a/evaluation/text2code_vllm.py b/evaluation/text2code_vllm.py
Lines changed: 59 additions & 24 deletions
diff --git a/prompts/self-ossinstruct-fewshot.txt b/prompts/self-ossinstruct-fewshot.txt
Lines changed: 3 additions & 0 deletions
diff --git a/requirements.txt b/requirements.txt
Lines changed: 1 addition & 0 deletions
diff --git a/src/star_align/collect_snippets.py b/src/star_align/collect_snippets.py
Lines changed: 3 additions & 2 deletions
diff --git a/src/star_align/sanitize_data.py b/src/star_align/sanitize_data.py
Lines changed: 5 additions & 0 deletions
diff --git a/src/star_align/self_ossinstruct.py b/src/star_align/self_ossinstruct.py
Lines changed: 17 additions & 8 deletions
diff --git a/src/star_align/utils.py b/src/star_align/utils.py
Lines changed: 42 additions & 7 deletions
@@ -4,20 +4,17 @@
 from typing import Literal, TypedDict, cast
 from evalplus.data import get_human_eval_plus, get_mbpp_plus, write_jsonl
 
-# from evoeval.data import get_evo_eval
+from evoeval.data import get_evo_eval
 from transformers import HfArgumentParser
 
-from star_align.prompt_template import SC2_INSTRUCT_PROMPT
-from star_align.utils import infer_prompt_template
+from star_align.utils import infer_prompt_template, is_base_model
 
 from vllm import LLM, SamplingParams
 
 
-PROMPT_TEMPLATE = SC2_INSTRUCT_PROMPT
-
-
 class Text2CodeProblem(TypedDict):
     id: str
+    prompt: str
     instruction: str
     response_prefix: str
 
@@ -39,6 +36,14 @@ def get_humaneval_raw_problems() -> list[dict]:
     return list(problems.values())
 
 
+def get_evoeval_raw_problems(dataset: str):
+    """Build a zero-argument loader for one EvoEval dataset.
+
+    Args:
+        dataset: Dataset name forwarded unchanged to ``get_evo_eval``.
+
+    Returns:
+        A callable taking no arguments that loads the dataset and returns
+        its problems as a list.
+    """
+
+    def get_raw_problems() -> list[dict]:
+        # Loading is deferred until the returned callable is invoked.
+        problems = get_evo_eval(dataset)
+        # Only the values of the loaded mapping are kept; keys are dropped.
+        return list(problems.values())
+
+    return get_raw_problems
+
+
 def map_mbpp_problem(p: dict) -> Text2CodeProblem:
     id = p["task_id"]
     prompt = p["prompt"]
@@ -52,14 +57,16 @@ def map_mbpp_problem(p: dict) -> Text2CodeProblem:
     assertion = prompt[assert_index:].strip()
     instruction = f"""{instruction}
 
-Your code should pass the following assertion:
 ```python
 {assertion}
 ```"""
     prefix = ""
     response_prefix = f"""{prefix}```python"""
     return Text2CodeProblem(
-        id=str(id), instruction=instruction, response_prefix=response_prefix
+        id=str(id),
+        prompt=prompt,
+        instruction=instruction,
+        response_prefix=response_prefix,
     )
 
 
@@ -91,7 +98,10 @@ def map_humaneval_problem(p: dict) -> Text2CodeProblem:
     # response_prefix = f"""{prefix}```python
     # {prompt}"""
     return Text2CodeProblem(
-        id=id, instruction=instruction, response_prefix=response_prefix
+        id=id,
+        prompt=prompt,
+        instruction=instruction,
+        response_prefix=response_prefix,
     )
 
 
@@ -120,44 +130,69 @@ class Args:
 def main():
     args = cast(Args, HfArgumentParser(Args).parse_args_into_dataclasses()[0])
     raw_problem_fn, map_problem_fn = (
-        (get_humaneval_raw_problems, map_humaneval_problem)
-        if args.dataset == "humaneval"
-        else (get_mbpp_raw_problems, map_mbpp_problem)
+        (get_evoeval_raw_problems(args.dataset), map_humaneval_problem)
+        if args.dataset.startswith("EvoEval_")
+        else (
+            (get_humaneval_raw_problems, map_humaneval_problem)
+            if args.dataset == "humaneval"
+            else (get_mbpp_raw_problems, map_mbpp_problem)
+        )
     )
     raw_problems = raw_problem_fn()
     problems = list(map(map_problem_fn, raw_problems))
 
-    engine = LLM(args.model_name_or_path or args.model_key)
+    engine = LLM(
+        tokenizer=args.model_key, model=args.model_name_or_path or args.model_key
+    )
+
+    base_model_prompt = is_base_model(args.model_key)
+
+    stop: str | list[str] = (
+        "\n```\n"
+        if not base_model_prompt
+        else ["\ndef ", "\nclass ", "\nimport ", "\nfrom ", "\nassert ", "\n# "]
+    )
     sampling_params = SamplingParams(
         n=args.n_samples_per_problem,
         temperature=args.temperature,
         max_tokens=args.max_new_tokens,
         top_k=-1,
         top_p=args.top_p,
-        stop="\n```\n",
+        stop=stop,
     )
 
-    # state = get_model_context(args.model_key, args.model_name_or_path)
-    try:
+    if base_model_prompt:
+        print("Base model")
+    else:
         prompt_template = infer_prompt_template(
             os.getenv("TOKENIZER") or args.model_name_or_path or args.model_key
         )
-    except:
-        prompt_template = PROMPT_TEMPLATE
-    # prompt_template = PROMPT_TEMPLATE
-    print("Using:", prompt_template)
+        # prompt_template = PROMPT_TEMPLATE
+        print("Using:", prompt_template)
 
     prompts: list[str] = []
     for problem in problems:
-        prompt = prompt_template.format(
-            instruction=problem["instruction"], response=problem["response_prefix"]
-        )
+        if not base_model_prompt:
+            prompt = prompt_template.format(
+                instruction=problem["instruction"], response=problem["response_prefix"]
+            )
+        else:
+            prompt = problem["prompt"]
         prompts.append(prompt)
 
     results = engine.generate(prompts, sampling_params)
     Path(args.save_path).write_text("")
+
     step = 20
     print_or_not = [idx == 0 or idx % step == 0 for idx in range(len(problems))]
+
+    def sanitize(output: str) -> str:
+        """Extract the completion text from one raw generation.
+
+        For non-base-model prompts, keep the text between the last
+        "```python" marker and the first "```" that follows it. For
+        base-model prompts, truncate at each configured stop string.
+        """
+        if not base_model_prompt:
+            return output.split("```python")[-1].split("```")[0]
+        # In the base-model case `stop` is the list of stop strings, so this
+        # iterates over whole strings rather than characters.
+        for s in stop:
+            # NOTE(review): rsplit(s, 1) cuts at the *last* occurrence of
+            # each stop string, not the first -- confirm this is intended.
+            output = output.rsplit(s, 1)[0]
+        return output
+
     for problem, prompt, result, print_debug in zip(
         problems, prompts, results, print_or_not
     ):
@@ -169,7 +204,7 @@ def main():
         samples = [
             dict(
                 task_id=problem["id"],
-                completion=output.text.split("```python")[-1].split("```")[0],
+                completion=sanitize(output.text),
             )
             for output in result.outputs
         ]
 
@@ -14,6 +14,9 @@ Design the tasks so that the relevant concepts emerge naturally as the most appr
 ### System: S->C
 Extract key programming concepts from the provided code snippet. Programming concepts refer to the foundational principles and techniques used in programming, which are crucial for developers to master. List these concepts in a comma-separated format.
 
+### System: S->I
+Gain inspiration from the given code snippets and create a series of independent coding tasks that are original, distinct, diverse, and high-quality, fostering logical thinking.
+
 ### Example 1
 [Code]
     value = int(round((value - prev) * 1e5))
 
@@ -7,3 +7,4 @@ tiktoken~=0.6.0
 accelerate>=0.27.2
 datasets>=2.17.1
 evalplus @ git+https://github.com/evalplus/evalplus.git@25e195e024b614f2671ad9ac5b8fdcd9b95a2b24#egg=evalplus
+evoeval~=0.1.0
@@ -34,11 +34,12 @@ class Args:
     max_avg_chars_per_line: int = field(default=80)
     # max_fragments: int = field(default=3)
     chunk_size: int = field(default=1000)
-    content_chunk_lines: int = field(default=100)
+    # A small value lets one document be used by multiple seeds
+    content_chunk_lines: int = field(default=99999999999)
 
     dataset_name: str = field(default="bigcode/starcoderdata")
     data_files: list[str] | None = field(default=None)
-    max_considered_data: int | None = field(default=100000)
+    max_considered_data: int | None = field(default=200000)
 
     collect_function: bool = field(default=False)
     max_nodes_to_traverse: int = field(default=20000)
 
@@ -24,6 +24,7 @@ class Args:
     data_files: list[str]
     output_file: str
     shuffle: bool = field(default=True)
+    remove_strange: bool = field(default=True)
     parse_raw_response: bool = field(default=True)
     passing_only: bool = field(default=True)
     data_augmentation: bool = field(default=False)
@@ -369,6 +370,10 @@ def mk_key(instruction: str) -> str:
 
         def iterate(dataset: Dataset):
             for d in tqdm(dataset):
+                if args.remove_strange:
+                    # NOTE: skip samples whose instruction exceeds 200 whitespace-separated words
+                    if len(d["instruction"].split()) > 200:
+                        continue
                 key_i, key_r = mk_key(d["instruction"]), mk_key(d["response"])
                 if key_i in seen_keys or key_r in seen_keys:
                     continue
 
@@ -14,7 +14,7 @@
 
 import star_align
 
-InstructMode = Literal["I->R", "S->C", "C->I"]
+InstructMode = Literal["I->R", "S->C", "C->I", "S->I"]
 
 LANGUAGE_MAP = {
     "cpp": "C++",
@@ -170,6 +170,8 @@ def prefix_template(mode: InstructMode) -> str:
             return "### Snippet\n{snippet}\n\n### Concepts\n"
         elif mode == "C->I":
             return "### Properties\n{property}\n\n### Task\n"
+        elif mode == "S->I":
+            return "### Snippet\n{snippet}\n\n### Task\n"
         else:
             assert False
 
@@ -199,6 +201,9 @@ def prompt(
             # property_prompt += f"\nnum_words: {num_words}"
             kwargs = dict(property=property_prompt)
             suffix = self.instruction
+        elif mode == "S->I":
+            kwargs = dict(snippet=self.snippet)
+            suffix = self.instruction
         else:
             assert False
         prefix = self.prefix_template(mode).format(**kwargs)
@@ -213,6 +218,7 @@ class Fewshot:
     sys_i_r: str
     sys_c_i: str
     sys_s_c: str
+    sys_s_i: str
 
     examples: list[Example]
 
@@ -297,8 +303,8 @@ def get_ossinstruct_fewshots() -> Fewshot:
     splits = re.split(r"### Example \d+", content)
     system_prompt = splits[0].strip()
     # "I->R", "E->S", "I->I", "PI->PI", "S->C"
-    sys_pattern = r"### System: I->R|### System: C->I|### System: S->C"
-    _, i_r, c_i, s_c = list(map(str.strip, re.split(sys_pattern, system_prompt)))
+    sys_pattern = r"### System: I->R|### System: C->I|### System: S->C|### System: S->I"
+    _, i_r, c_i, s_c, s_i = list(map(str.strip, re.split(sys_pattern, system_prompt)))
     if LLAMA3:
         i_r = f"{i_r}\n\nFor each '## Example' below, make sure you provide a '### Response' and a '### Tests' section."
     # system_prompt = re.split(r"### System: Instruction", system_prompt)[1]
@@ -331,6 +337,7 @@ def get_ossinstruct_fewshots() -> Fewshot:
         sys_i_r=i_r,
         sys_c_i=c_i,
         sys_s_c=s_c,
+        sys_s_i=s_i,
         examples=examples,
     )
 
@@ -343,6 +350,8 @@ def parse_generated_content(content: str, instruct_mode: InstructMode) -> dict |
         return dict(concepts=concepts)
     elif instruct_mode == "C->I":
         return dict(instruction=content.strip())
+    elif instruct_mode == "S->I":
+        return dict(instruction=content.strip())
     else:
         assert False
 
@@ -352,11 +361,11 @@ def build_kwargs(instruct_mode: InstructMode, example: dict) -> dict[str, str]:
     if instruct_mode == "I->R":
         kwargs["instruction"] = example["instruction"]
         # Hack
-        category_index = example["prompt"].rindex("category: ") + len("category: ")
-        category_end = example["prompt"].index("\n", category_index)
-        category = example["prompt"][category_index:category_end].strip()
-        kwargs["category"] = category  # type: ignore
-    elif instruct_mode == "S->C":
+        # category_index = example["prompt"].rindex("category: ") + len("category: ")
+        # category_end = example["prompt"].index("\n", category_index)
+        # category = example["prompt"][category_index:category_end].strip()
+        # kwargs["category"] = category  # type: ignore
+    elif instruct_mode in ["S->C", "S->I"]:
         kwargs["snippet"] = example["seed"]
     elif instruct_mode == "C->I":
         lang = example.get("data_dir", "dummy_key_not_in_example")
 
@@ -192,17 +192,52 @@ def find_codeblock_indices(
     return all_indices
 
 
+# Fallback prompt layout with `{instruction}` and `{response}` placeholders,
+# used by `infer_prompt_template` when the tokenizer has no chat template.
+DEFAULT_TEMPLATE = """\
+### Instruction
+{instruction}
+
+### Response
+{response}"""
+
+
+def is_base_model(tokenizer_name: str) -> bool:
+    """Report whether the named tokenizer should be treated as a base model.
+
+    Args:
+        tokenizer_name: Name or path passed to
+            ``AutoTokenizer.from_pretrained``.
+
+    Returns:
+        True when the loaded tokenizer has no chat template and the name
+        does not contain "octocoder"; False otherwise.
+    """
+    from transformers import AutoTokenizer
+
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
+    # "octocoder" names are excluded here because `infer_prompt_template`
+    # assigns them OCTOCODER_CHAT_TEMPLATE explicitly.
+    return tokenizer.chat_template is None and "octocoder" not in tokenizer_name
+
+
+# Jinja chat template assigned to tokenizers whose name contains "octocoder".
+# User turns render as "Question: ...", every other non-system turn as
+# "Answer: ...", system messages raise an exception, and a final
+# "Question: " is appended after the last message.
+OCTOCODER_CHAT_TEMPLATE = """\
+{%- for message in messages %}
+    {%- if message['role'] == 'system' %}
+        {{ raise_exception('System messages are not allowed in this template.') }}
+    {%- else %}
+        {%- if message['role'] == 'user' %}
+{{'Question: ' + message['content'] + '\n\n'}}
+        {%- else %}
+{{'Answer: ' + message['content'] + '\n\n'}}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{{'Question: '}}"""
+
+
 def infer_prompt_template(tokenizer_name: str) -> str:
+    """Derive a prompt format string from a tokenizer's chat template.
+
+    Args:
+        tokenizer_name: Name or path passed to
+            ``AutoTokenizer.from_pretrained``.
+
+    Returns:
+        The rendered template, truncated immediately after its last
+        ``{response}`` placeholder.
+
+    Raises:
+        ValueError: If the rendered template contains no ``{response}``
+            (raised by ``str.rindex``).
+    """
     from transformers import AutoTokenizer
 
     tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
-    template = tokenizer.apply_chat_template(
-        [
-            {"role": "user", "content": "{instruction}"},
-            {"role": "assistant", "content": "{response}"},
-        ],
-        tokenize=False,
-    )
+    # Names containing "octocoder" get the template defined in this module.
+    if "octocoder" in tokenizer_name:
+        tokenizer.chat_template = OCTOCODER_CHAT_TEMPLATE
+    if tokenizer.chat_template is not None:
+        # Render one user/assistant exchange whose contents are the literal
+        # placeholders, so the result can be used as a format string.
+        template = tokenizer.apply_chat_template(
+            [
+                {"role": "user", "content": "{instruction}"},
+                {"role": "assistant", "content": "{response}"},
+            ],
+            tokenize=False,
+        )
+    else:
+        # No chat template available: fall back to the plain layout.
+        template = DEFAULT_TEMPLATE
+    # Drop whatever the template emits after the response placeholder.
     end_index = template.rindex("{response}") + len("{response}")
     template = template[:end_index]
     return template