# digital-forensics-lab/IP_Extraction_Fine_Tuning/code/preprocess_dataset.py

import pandas as pd
import json
import os


# Convert ./openAI/dataset_small.csv into two JSON Lines files (train/test).
#
# Each CSV row becomes one JSON object whose first field is "<text>;<ip>" and
# whose "completion" field is a label derived from the "class" column.

# Locations, all resolved against the current working directory at run time.
INPUT_CSV = "./openAI/dataset_small.csv"
TRAIN_JSONL = "./openAI/dataset_train.jsonl"
TEST_JSONL = "./openAI/dataset_test.jsonl"

# Fraction of the records (taken from the start of the file) used for training.
TRAIN_FRACTION = 0.8

# NOTE(review): "promote" and "negtive" look like typos of "prompt" and
# "negative". They are kept byte-for-byte because whatever consumes the
# generated files may depend on these exact strings -- confirm before changing.
PROMPT_KEY = "promote"
COMPLETION_KEY = "completion"
POSITIVE_LABEL = "positive"
NEGATIVE_LABEL = "negtive"


def build_records(data: pd.DataFrame) -> list:
    """Turn the rows of *data* into a list of prompt/completion dictionaries.

    *data* must provide the columns "text", "ip" and "class". The prompt is
    ``text + ";" + ip``; both values are joined with ``+``, so they must be
    strings (a missing value read as NaN raises ``TypeError``). The completion
    is the negative label when "class" equals "N" and the positive label for
    every other value.
    """
    return [
        {
            PROMPT_KEY: text + ";" + ip,
            COMPLETION_KEY: POSITIVE_LABEL if label != "N" else NEGATIVE_LABEL,
        }
        for text, ip, label in zip(data["text"], data["ip"], data["class"])
    ]


def split_records(records: list, train_fraction: float = TRAIN_FRACTION) -> tuple:
    """Split *records* into ``(train, test)`` without reordering them.

    The first ``int(train_fraction * len(records))`` records form the training
    part and the remainder the test part. No shuffling is performed, so the
    split follows the order of the input.
    """
    split_index = int(train_fraction * len(records))
    return records[:split_index], records[split_index:]


def write_jsonl(path: str, records: list) -> None:
    """Write *records* to *path*, one JSON document per line."""
    with open(path, "w", encoding="utf-8") as handle:
        handle.writelines(json.dumps(record) + "\n" for record in records)


def format_summary(
    train_path: str, test_path: str, train_fraction: float = TRAIN_FRACTION
) -> str:
    """Return the completion message, naming the files that were really written."""
    return (
        f"JSON objects have been split ({train_fraction:.0%} and "
        f"{1 - train_fraction:.0%}) and written to "
        f"{os.path.basename(train_path)} and {os.path.basename(test_path)}"
    )


def main() -> None:
    """Read the CSV, build the records and write the train/test JSONL files."""
    base_dir = os.getcwd()
    data = pd.read_csv(os.path.join(base_dir, INPUT_CSV))

    train_data, test_data = split_records(build_records(data))

    train_file_path = os.path.join(base_dir, TRAIN_JSONL)
    test_file_path = os.path.join(base_dir, TEST_JSONL)
    write_jsonl(train_file_path, train_data)
    write_jsonl(test_file_path, test_data)

    print(format_summary(train_file_path, test_file_path))


if __name__ == "__main__":
    main()