Skip to main content

Documentation Index

Fetch the complete documentation index at: https://docs.monostate.ai/llms.txt

Use this file to discover all available pages before exploring further.

Custom Scripts

Extend AITraining with custom preprocessing and postprocessing scripts.

Requirements

Custom scripts require Python >= 3.8 and can use libraries bundled with AITraining:
  • pandas - Data manipulation
  • torch - PyTorch for model operations
  • transformers - Hugging Face models
  • peft - LoRA/adapter operations
These are installed automatically with AITraining.

Data Preprocessing

Custom Data Pipeline

# prepare_data.py
import json
import pandas as pd
from pathlib import Path

def load_and_clean(input_path, min_length=10):
    """Load raw CSV data and apply basic text cleaning.

    Args:
        input_path: Path or file-like object accepted by ``pd.read_csv``;
            the data must contain a ``text`` column.
        min_length: Rows whose stripped text is not longer than this many
            characters are dropped. Defaults to 10, the previously
            hard-coded threshold, so existing callers see no change.

    Returns:
        A ``pandas.DataFrame`` with missing-text rows removed, ``text``
        stripped of surrounding whitespace, and too-short rows filtered out.
    """
    df = pd.read_csv(input_path)

    # Remove rows where the text column is missing entirely
    df = df.dropna(subset=['text'])

    # Normalize whitespace, then drop rows too short to be useful
    df['text'] = df['text'].str.strip()
    df = df[df['text'].str.len() > min_length]

    return df

def convert_to_chat_format(df, text_col='text'):
    """Wrap each row's text in a minimal user/assistant chat exchange.

    Every value in *text_col* becomes the assistant turn of a two-message
    conversation, paired with a fixed "Continue this text:" user prompt —
    the shape expected by SFT chat training data.
    """
    return [
        {
            "messages": [
                {"role": "user", "content": "Continue this text:"},
                {"role": "assistant", "content": text},
            ]
        }
        for text in df[text_col]
    ]

def main():
    """Run the preprocessing pipeline: clean, convert to chat, write JSONL."""
    # Clean the raw CSV
    cleaned = load_and_clean("raw_data.csv")

    # Reshape into chat-style records
    chat_records = convert_to_chat_format(cleaned)

    # Emit one JSON object per line
    destination = Path("processed_data.jsonl")
    with destination.open('w') as sink:
        sink.writelines(json.dumps(record) + '\n' for record in chat_records)

    print(f"Saved {len(chat_records)} conversations")

if __name__ == "__main__":
    main()
Usage:
python prepare_data.py
aitraining llm --train --data-path ./processed_data.jsonl ...

DPO Data Preparation

# prepare_dpo_data.py
import json
from pathlib import Path

def create_dpo_pairs(preference_data):
    """Reshape preference annotations into prompt/chosen/rejected records.

    Each input item must provide ``question``, ``preferred_answer`` and
    ``rejected_answer`` keys; the output uses the field names DPO expects.
    """
    return [
        {
            "prompt": record["question"],
            "chosen": record["preferred_answer"],
            "rejected": record["rejected_answer"],
        }
        for record in preference_data
    ]

def main():
    """Read preference annotations and emit a DPO-format JSONL file."""
    # Load preference annotations
    with open("preferences.json") as source:
        annotations = json.load(source)

    # Reshape into DPO records
    pairs = create_dpo_pairs(annotations)

    # One JSON object per line
    with open("dpo_data.jsonl", 'w') as sink:
        sink.writelines(json.dumps(pair) + '\n' for pair in pairs)

    print(f"Created {len(pairs)} DPO pairs")

if __name__ == "__main__":
    main()

Evaluation Scripts

Custom Evaluation

# evaluate_model.py
import argparse
import json
from transformers import AutoModelForCausalLM, AutoTokenizer

def load_model(model_path):
    """Load a fine-tuned causal LM and its tokenizer from *model_path*."""
    # Tokenizer first, then weights — same order as the original checkpoint load.
    tok = AutoTokenizer.from_pretrained(model_path)
    net = AutoModelForCausalLM.from_pretrained(model_path)
    return net, tok

def evaluate_prompts(model, tokenizer, prompts, max_new_tokens=100):
    """Generate one response per evaluation prompt.

    Args:
        model: A causal LM exposing ``generate`` (e.g. the result of
            ``AutoModelForCausalLM.from_pretrained``).
        tokenizer: Matching tokenizer used to encode prompts and decode output.
        prompts: Iterable of prompt strings.
        max_new_tokens: Generation budget per prompt. Defaults to 100, the
            previously hard-coded value, so existing callers are unaffected.

    Returns:
        A list of ``{"prompt": ..., "response": ...}`` dicts. ``response``
        is the full decoded sequence, which includes the prompt text.
    """
    results = []

    for prompt in prompts:
        inputs = tokenizer(prompt, return_tensors="pt")
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
        # Decode the entire returned sequence (prompt + continuation)
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)

        results.append({
            "prompt": prompt,
            "response": response
        })

    return results

def main():
    """CLI entry point: load a model, run eval prompts, dump JSON results."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-path", required=True)
    parser.add_argument("--prompts-file", default="eval_prompts.txt")
    args = parser.parse_args()

    # Load the trained model and tokenizer
    model, tokenizer = load_model(args.model_path)

    # One prompt per non-blank line
    prompts = []
    with open(args.prompts_file) as handle:
        for raw in handle:
            stripped = raw.strip()
            if stripped:
                prompts.append(stripped)

    # Generate responses for every prompt
    results = evaluate_prompts(model, tokenizer, prompts)

    # Persist results as pretty-printed JSON
    with open("eval_results.json", 'w') as sink:
        json.dump(results, sink, indent=2)

    print(f"Evaluated {len(results)} prompts")

if __name__ == "__main__":
    main()

Benchmark Script

# benchmark.py
import time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def benchmark_model(model_path, prompts, num_runs=5, max_new_tokens=50):
    """Benchmark model generation latency and throughput.

    Args:
        model_path: Checkpoint path/name for ``AutoModelForCausalLM``.
        prompts: List of prompt strings; each run generates for all of them.
        num_runs: Number of timed passes over the prompt list.
        max_new_tokens: Generation budget per prompt. Defaults to 50, the
            previously hard-coded value.

    Returns:
        Dict with ``avg_time`` (seconds per pass) and ``tokens_per_second``.
        Throughput now counts tokens actually generated — the budget is only
        an upper bound, since generation can stop early at EOS; the old code
        assumed exactly 50 tokens per prompt.
    """
    model = AutoModelForCausalLM.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Decide the device once instead of re-checking inside the loop
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model = model.cuda()

    times = []
    generated_tokens = 0

    for _ in range(num_runs):
        start = time.time()

        for prompt in prompts:
            inputs = tokenizer(prompt, return_tensors="pt")
            if use_cuda:
                inputs = {k: v.cuda() for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)

            # Count tokens actually produced (output length minus prompt length)
            generated_tokens += outputs.shape[-1] - inputs["input_ids"].shape[-1]

        times.append(time.time() - start)

    avg_time = sum(times) / len(times)
    # Tokens generated in one pass divided by seconds per pass
    tokens_per_sec = (generated_tokens / num_runs) / avg_time

    return {
        "avg_time": avg_time,
        "tokens_per_second": tokens_per_sec
    }

Post-Processing Scripts

Model Merging

# merge_lora.py
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import argparse

def merge_and_save(base_model, adapter_path, output_path):
    """Merge a LoRA adapter into its base model and write the result.

    Loads *base_model*, folds the adapter at *adapter_path* into its
    weights, and saves the standalone merged checkpoint (plus tokenizer)
    to *output_path*.
    """
    # Base weights and tokenizer come from the original checkpoint
    base = AutoModelForCausalLM.from_pretrained(base_model)
    tok = AutoTokenizer.from_pretrained(base_model)

    # Attach the adapter, then fold its deltas into the base weights
    merged = PeftModel.from_pretrained(base, adapter_path).merge_and_unload()

    # Persist the merged model alongside its tokenizer
    merged.save_pretrained(output_path)
    tok.save_pretrained(output_path)

    print(f"Merged model saved to {output_path}")

def main():
    """Parse CLI arguments and run the adapter merge."""
    parser = argparse.ArgumentParser()
    # All three paths are mandatory
    for flag in ("--base-model", "--adapter-path", "--output-path"):
        parser.add_argument(flag, required=True)
    args = parser.parse_args()

    merge_and_save(args.base_model, args.adapter_path, args.output_path)

if __name__ == "__main__":
    main()
Or use the built-in tool:
aitraining tools merge-llm-adapter \
  --base-model-path google/gemma-3-270m \
  --adapter-path ./my-lora-model \
  --output-folder ./merged-model
The built-in tool also supports --token, --pad-to-multiple-of, and --push-to-hub flags. You must specify either --output-folder or --push-to-hub.

Integration Example

Full Training Script

#!/bin/bash
# train_with_scripts.sh
# End-to-end pipeline: preprocess -> train with LoRA -> merge adapter -> evaluate.

# Abort on the first failing step
set -e

# 1. Prepare data (prepare_data.py writes processed_data.jsonl)
echo "Preparing data..."
python scripts/prepare_data.py

# 2. Train model with a LoRA adapter (--peft)
echo "Training..."
aitraining llm --train \
  --model google/gemma-3-270m \
  --data-path processed_data.jsonl \
  --project-name my-model \
  --peft

# 3. Merge the trained adapter back into the base model
echo "Merging adapter..."
python scripts/merge_lora.py \
  --base-model google/gemma-3-270m \
  --adapter-path my-model \
  --output-path my-model-merged

# 4. Evaluate the merged model
echo "Evaluating..."
python scripts/evaluate_model.py \
  --model-path my-model-merged

echo "Done!"

Next Steps

Pipeline Automation

Full pipeline examples

Logging & Debugging

Monitor your scripts