mirror of
https://github.com/frankwxu/digital-forensics-lab.git
synced 2026-04-10 12:13:44 +00:00
add IP extraction code and small dataset
This commit is contained in:
59
IP_Extraction_Fine_Tuning/code/preprocess_dataset.py
Normal file
59
IP_Extraction_Fine_Tuning/code/preprocess_dataset.py
Normal file
@@ -0,0 +1,59 @@
|
||||
import pandas as pd
|
||||
import json
|
||||
import os
|
||||
|
||||
|
||||
# Convert the labelled IP-extraction CSV into two JSON Lines files (train and
# test). All paths are resolved against the current working directory at run
# time, so the script must be started from the directory that contains
# "openAI/".
INPUT_CSV = "./openAI/dataset_small.csv"
TRAIN_JSONL = "./openAI/dataset_train.jsonl"
TEST_JSONL = "./openAI/dataset_test.jsonl"

# Fraction of the records (taken from the top of the CSV) used for training.
TRAIN_FRACTION = 0.8

# NOTE(review): "promote" is presumably meant to be "prompt"; it is kept
# unchanged because whatever consumes the generated files may rely on this
# exact key -- confirm before renaming.
PROMPT_KEY = "promote"
COMPLETION_KEY = "completion"

# Value of the "class" column that marks a negative sample.
NEGATIVE_CLASS = "N"
POSITIVE_LABEL = "positive"
# NOTE(review): "negtive" looks like a misspelling of "negative"; it is kept
# byte-for-byte because the label is written into the dataset and may be
# matched elsewhere -- confirm before correcting.
NEGATIVE_LABEL = "negtive"


def build_records(data):
    """Turn the labelled rows into prompt/completion dictionaries.

    Args:
        data: Table indexable by column name that provides the "text", "ip"
            and "class" columns (the DataFrame returned by ``pd.read_csv``).

    Returns:
        One dictionary per row, in row order. The prompt is the row's text
        and IP joined by ";"; the completion is the positive label unless
        the row's class equals ``NEGATIVE_CLASS``.
    """
    return [
        {
            PROMPT_KEY: text + ";" + ip,
            COMPLETION_KEY: (
                POSITIVE_LABEL if label != NEGATIVE_CLASS else NEGATIVE_LABEL
            ),
        }
        for text, ip, label in zip(data["text"], data["ip"], data["class"])
    ]


def split_records(records, train_fraction=TRAIN_FRACTION):
    """Split *records* into a leading train part and a trailing test part.

    The split is positional (no shuffling); the train part holds
    ``int(train_fraction * len(records))`` items.

    Returns:
        A ``(train, test)`` tuple of lists.
    """
    split_index = int(train_fraction * len(records))
    return records[:split_index], records[split_index:]


def write_jsonl(records, path):
    """Write each record to *path* as one JSON document per line."""
    # json.dumps escapes non-ASCII characters by default, so the explicit
    # encoding only pins the file format; it does not change the bytes written.
    with open(path, "w", encoding="utf-8") as out_file:
        out_file.writelines(json.dumps(record) + "\n" for record in records)


def main():
    """Read the CSV, build the records and write the train/test files."""
    working_dir = os.getcwd()

    # Read the CSV file into a DataFrame
    data = pd.read_csv(os.path.join(working_dir, INPUT_CSV))

    train_data, test_data = split_records(build_records(data))

    write_jsonl(train_data, os.path.join(working_dir, TRAIN_JSONL))
    write_jsonl(test_data, os.path.join(working_dir, TEST_JSONL))

    print(
        "JSON objects have been split (80% and 20%) and written to "
        "dataset_train.jsonl and dataset_test.jsonl"
    )


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user