Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions finetuning/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
venv/
__pycache__/
*.pyc
*.pyo
*.pyd
.Python
*.so
*.egg
*.egg-info/
dist/
build/
pii-redactor-fused/
pii-redactor-finetune.tar
data/
.DS_Store
51 changes: 51 additions & 0 deletions finetuning/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# PII Redactor - Setup & Usage

## Installation

# 1. Clone repository
git clone https://github.com/YOUR_USERNAME/pii-redactor.git
cd pii-redactor

# 2. Create virtual environment (recommended)
python3 -m venv venv
source venv/bin/activate

# 3. Install dependencies
pip install mlx mlx-lm

## Usage

# Basic usage (first make the script executable: chmod +x redact.py)
./redact.py "Contact John Doe at john@example.com or call 555-123-4567"

# From stdin
echo "Server at 192.168.1.100" | ./redact.py

# From file
cat ticket.txt | ./redact.py

## Expected Output

Input:
"Contact John Doe at john@example.com or call 555-123-4567"

Output:
{
"redacted_text": "Contact John [LastName1] at [Email2] or call [Phone3]",
"properties_redacted": {
"lastname1": "Doe",
"email2": "john@example.com",
"phone3": "555-123-4567"
}
}

## Requirements

- Mac with Apple Silicon (M1/M2/M3/M4)
- Python 3.9+
- 8GB+ RAM

## First Run

- Downloads base model (~700MB) - takes 2-3 minutes
- Subsequent runs are instant
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
40 changes: 40 additions & 0 deletions finetuning/adapters/pii-redactor/adapter_config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
{
"adapter_path": "adapters/pii-redactor",
"batch_size": 4,
"config": null,
"data": "data",
"fine_tune_type": "lora",
"grad_accumulation_steps": 1,
"grad_checkpoint": false,
"iters": 1000,
"learning_rate": 1e-05,
"lora_parameters": {
"rank": 8,
"dropout": 0.0,
"scale": 20.0
},
"lr_schedule": null,
"mask_prompt": false,
"max_seq_length": 2048,
"model": "mlx-community/Llama-3.2-1B-Instruct-4bit",
"num_layers": 8,
"optimizer": "adam",
"optimizer_config": {
"adam": {},
"adamw": {},
"muon": {},
"sgd": {},
"adafactor": {}
},
"project_name": null,
"report_to": null,
"resume_adapter_file": null,
"save_every": 100,
"seed": 0,
"steps_per_eval": 100,
"steps_per_report": 10,
"test": false,
"test_batches": 500,
"train": true,
"val_batches": 25
}
Binary file not shown.
31 changes: 31 additions & 0 deletions finetuning/redact.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
"""Simple PII redactor"""
import sys
from mlx_lm import load, generate

print("Loading model...", file=sys.stderr)
model, tokenizer = load(
"mlx-community/Llama-3.2-1B-Instruct-4bit",
adapter_path="adapters/pii-redactor"
)

if len(sys.argv) > 1:
text = " ".join(sys.argv[1:])
else:
text = sys.stdin.read().strip()

system_prompt = """You are a privacy compliance officer. Redact PII and output JSON:
{
"redacted_text": "text with [LastNameN], [EmailN], [PhoneN], [AddressN], [IPN] placeholders",
"properties_redacted": {"lastname1": "original", "email1": "original", ...}
}"""

messages = [
{"role": "system", "content": system_prompt},
{"role": "user", "content": f"Redact all PII from this text and output in JSON format:\n\n{text}"}
]

prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

result = generate(model, tokenizer, prompt=prompt, max_tokens=512)

print(result)