Skip to main content

Get LLM Finetuning Data

The following are examples of training data in different formats.

from datasets import load_dataset
from superduper.base.document import Document

# Fetch the OpenAssistant-Guanaco dataset from the Hugging Face Hub.
dataset_name = "timdettmers/openassistant-guanaco"
dataset = load_dataset(dataset_name)

train_dataset = dataset["train"]
eval_dataset = dataset["test"]

# Wrap every raw example in a Document, tagging each record with the
# fold ("train" / "valid") used to separate the two splits downstream.
train_documents = [
    Document(dict(example, _fold="train"))
    for example in train_dataset
]
eval_documents = [
    Document(dict(example, _fold="valid"))
    for example in eval_dataset
]

# One combined list: training documents first, then validation documents.
datas = train_documents + eval_documents

We can define training parameters suited to this data format.

# Function for transformation after extracting data from the database;
# None means the records are used as-is.
transform = None

# Name of the field holding the training text. The original wrote
# `('text')`, but parentheses alone do not create a tuple — that
# expression is just the string 'text' — so the misleading parens are
# dropped while the value stays identical.
key = 'text'

# Extra keyword arguments for the trainer (SFT-style trainers use
# `dataset_text_field` to select the text column — confirm against the
# trainer API in use).
training_kwargs = dict(dataset_text_field="text")

Below is an example of extracting an `input_text` (prompt) and `output_text` (response) pair from a single record:

# Show how one record decomposes into a prompt (input) and a response.
data = datas[0]
full_text = data["text"]

# Everything up to and including the *last* assistant tag is the prompt.
prefix, remainder = full_text.rsplit("### Assistant: ", maxsplit=1)
input_text = prefix + "### Assistant: "

# Keep only the assistant reply: cut off any follow-up human turn.
output_text = remainder.partition("### Human:")[0]

print("Input: --------------")
print(input_text)
print("Response: --------------")
print(output_text)