Files
digital-forensics-lab/IP_Extraction_Fine_Tuning/code/preprocess_dataset.py
2023-08-13 10:01:43 -04:00

60 lines
1.7 KiB
Python

import pandas as pd
import json
import os
# Convert ./openAI/dataset_small.csv into two JSON Lines files: the first 80%
# of the rows go to dataset_train.jsonl and the remainder to dataset_test.jsonl.
# Every output line is one JSON object with a "promote" and a "completion" key.

# All paths are resolved against the current working directory.
INPUT_CSV = "./openAI/dataset_small.csv"
TRAIN_JSONL = "./openAI/dataset_train.jsonl"
TEST_JSONL = "./openAI/dataset_test.jsonl"

# Share of the records (taken from the start of the file) used for training.
TRAIN_FRACTION = 0.8

# NOTE(review): "promote" looks like a typo for "prompt" (the key the OpenAI
# prompt/completion fine-tuning format uses) and "negtive" for "negative".
# Both are kept byte-for-byte because they are part of the emitted dataset;
# confirm against whatever consumes the files before renaming them.
PROMPT_KEY = "promote"
COMPLETION_KEY = "completion"
POSITIVE_LABEL = "positive"
NEGATIVE_LABEL = "negtive"


def build_records(data) -> list:
    """Turn the tabular data into a list of prompt/completion dicts.

    Args:
        data: Column-indexable table providing equal-length "text", "ip" and
            "class" columns, e.g. the DataFrame returned by ``pd.read_csv``
            (a plain dict of lists works as well).

    Returns:
        One dict per row, in row order. The prompt is ``text + ";" + ip``; the
        completion is the negative label when the class is exactly "N" and
        the positive label for any other value.

    Raises:
        TypeError: If a "text" or "ip" value is not a string (for example an
            empty CSV cell that pandas parsed as NaN).
    """
    # Walking the columns in lockstep avoids building a Series per row, which
    # is what DataFrame.iterrows() does.
    return [
        {
            PROMPT_KEY: text + ";" + ip,
            COMPLETION_KEY: POSITIVE_LABEL if label != "N" else NEGATIVE_LABEL,
        }
        for text, ip, label in zip(data["text"], data["ip"], data["class"])
    ]


def split_records(records, train_fraction=TRAIN_FRACTION) -> tuple:
    """Split *records* positionally into ``(train, held_out)`` lists.

    The first ``int(train_fraction * len(records))`` items form the training
    part; everything after that index forms the held-out part. No shuffling
    is performed.
    """
    split_index = int(train_fraction * len(records))
    return records[:split_index], records[split_index:]


def write_jsonl(records, path) -> None:
    """Write *records* to *path* as JSON Lines (one JSON object per line)."""
    with open(path, "w", encoding="utf-8") as handle:
        handle.writelines(json.dumps(record) + "\n" for record in records)


def main() -> None:
    """Read the CSV, build the records and write the train/test files."""
    base_dir = os.getcwd()
    data = pd.read_csv(os.path.join(base_dir, INPUT_CSV))

    train_data, test_data = split_records(build_records(data))

    train_file_path = os.path.join(base_dir, TRAIN_JSONL)
    test_file_path = os.path.join(base_dir, TEST_JSONL)
    write_jsonl(train_data, train_file_path)
    write_jsonl(test_data, test_file_path)

    # The previous message contained "\and" (where "\a" is the BEL control
    # character) and named files that are not the ones actually written.
    print(
        "JSON objects have been split (80% and 20%) and written to {} and {}".format(
            train_file_path, test_file_path
        )
    )


if __name__ == "__main__":
    main()