xtuner (#5) · InternLM/Tutorial@438fbee · GitHub
[go: up one dir, main page]

Skip to content

Commit 438fbee

Browse files
authored
xtuner (#5)
1 parent 678d057 commit 438fbee

19 files changed

+13162
-0
lines changed

xtuner/MedQA2019-structured-test.jsonl

Lines changed: 1874 additions & 0 deletions
Large diffs are not rendered by default.

xtuner/MedQA2019-structured-train.jsonl

Lines changed: 4340 additions & 0 deletions
Large diffs are not rendered by default.

xtuner/MedQA2019-structured.jsonl

Lines changed: 6212 additions & 0 deletions
Large diffs are not rendered by default.

xtuner/MedQA2019.xlsx

160 KB
Binary file not shown.

xtuner/README.md

Lines changed: 666 additions & 0 deletions
Large diffs are not rendered by default.

xtuner/imgs/afterFT.png

48.4 KB
Loading

xtuner/imgs/beforeFT.png

93.1 KB
Loading

xtuner/imgs/bugfix1.png

27.4 KB
Loading

xtuner/imgs/bugfix2.png

16.1 KB
Loading

xtuner/imgs/cat_fly.png

29.8 KB
Loading

xtuner/imgs/cfgs.png

152 KB
Loading

xtuner/imgs/dataProcessed.png

360 KB
Loading

xtuner/imgs/head.png

426 KB
Loading

xtuner/imgs/medqa2019samples.png

108 KB
Loading

xtuner/imgs/msagent_data.png

111 KB
Loading

xtuner/imgs/serper.png

18.3 KB
Loading

xtuner/imgs/ysqd.png

50.2 KB
Loading

xtuner/split2train_and_test.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
import json
2+
import random
3+
4+
def split_conversations(input_file, train_output_file, test_output_file,
                        train_ratio=0.7, seed=None):
    """Randomly split a JSON list of conversations into train and test files.

    The input is parsed with ``json.load``, so it must contain a single JSON
    array (not one JSON object per line, whatever the file extension says).
    Both outputs are written the same way: one indented JSON array per file.

    Args:
        input_file: Path of the JSON file holding the list of conversations.
        train_output_file: Destination path for the training portion.
        test_output_file: Destination path for the test portion.
        train_ratio: Fraction of the items placed in the training set; must
            lie in the range [0, 1]. Defaults to 0.7.
        seed: Optional seed for a reproducible shuffle. When ``None``, the
            shared module-level random generator is used.

    Raises:
        ValueError: If ``train_ratio`` is outside [0, 1].
    """
    # Validate first so a bad ratio never touches the file system.
    if not 0 <= train_ratio <= 1:
        raise ValueError(
            f"train_ratio must be between 0 and 1, got {train_ratio!r}"
        )

    # Read the whole input as one JSON document.
    with open(input_file, 'r', encoding='utf-8') as jsonl_file:
        data = json.load(jsonl_file)

    # A single shuffle already yields a uniformly random permutation;
    # repeating it adds no extra randomness.
    rng = random.Random(seed) if seed is not None else random
    rng.shuffle(data)

    # Everything before the split point is train, the remainder is test.
    split_point = int(len(data) * train_ratio)
    train_data = data[:split_point]
    test_data = data[split_point:]

    with open(train_output_file, 'w', encoding='utf-8') as train_jsonl_file:
        json.dump(train_data, train_jsonl_file, indent=4)

    with open(test_output_file, 'w', encoding='utf-8') as test_jsonl_file:
        json.dump(test_data, test_jsonl_file, indent=4)

    print(f"Split complete. Train data written to {train_output_file}, Test data written to {test_output_file}")
33+
34+
# Script entry: split the structured MedQA2019 file into train and test files.
# Edit the three paths below to point at your own files.
split_conversations(
    input_file='MedQA2019-structured.jsonl',
    train_output_file='MedQA2019-structured-train.jsonl',
    test_output_file='MedQA2019-structured-test.jsonl',
)

xtuner/xlsx2jsonl.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
import openpyxl
2+
import json
3+
4+
def process_excel_to_json(input_file, output_file, sheet_name="DrugQA"):
    """Convert question/answer rows of an Excel sheet into a JSON conversation list.

    Every row from the second one onward (the first row is presumably a
    header) becomes one entry of the form
    ``{"conversation": [{"system": ..., "input": <column A>, "output": <column D>}]}``.
    The resulting list is written to ``output_file`` as a single indented
    JSON array.

    Args:
        input_file: Path of the ``.xlsx`` workbook to read.
        output_file: Path of the JSON file to write.
        sheet_name: Name of the worksheet to convert. Defaults to "DrugQA".
    """
    # The system prompt is identical for every row, so build it once.
    system_value = "You are a professional, highly experienced doctor professor. You always provide accurate, comprehensive, and detailed answers based on the patients' questions."

    wb = openpyxl.load_workbook(input_file)
    sheet = wb[sheet_name]

    output_data = []

    # min_row=2 skips the first row; max_col=4 limits each tuple to columns A-D.
    for row in sheet.iter_rows(min_row=2, max_col=4, values_only=True):
        question, answer = row[0], row[3]

        # Entirely blank rows would otherwise be exported as conversations
        # whose input and output are both null.
        if question is None and answer is None:
            continue

        conversation = {
            "system": system_value,
            "input": question,
            "output": answer,
        }
        output_data.append({"conversation": [conversation]})

    with open(output_file, 'w', encoding='utf-8') as json_file:
        json.dump(output_data, json_file, indent=4)

    print(f"Conversion complete. Output written to {output_file}")
33+
34+
# Script entry: convert the MedQA2019 workbook into the JSON conversation file.
# Edit the two paths below to point at your own input and output files.
process_excel_to_json(
    input_file='MedQA2019.xlsx',
    output_file='output.jsonl',
)

0 commit comments

Comments
 (0)
0