kalshi-backtest/scripts/train_ml_models.py
Nicholai 025322219c feat: initial commit with code quality refactoring
kalshi prediction market backtesting framework with:
- trading pipeline (sources, filters, scorers, selectors)
- position sizing with kelly criterion
- multiple scoring strategies (momentum, mean reversion, etc)
- random baseline for comparison

refactoring includes:
- extract shared resolve_closed_positions() function
- reduce RandomBaseline::run() nesting with helper functions
- move MarketCandidate Default impl to types.rs
- add explanatory comments to complex logic
2026-01-21 09:32:12 -07:00

281 lines
9.4 KiB
Python

#!/usr/bin/env python3
"""
Train ML models for the kalshi backtest framework.
Models:
- LSTM: learns patterns from price history sequences
- MLP: learns optimal combination of hand-crafted features
Usage:
python scripts/train_ml_models.py --data data/trades.csv --output models/
"""
import argparse
import json  # NOTE(review): json appears unused in this file — confirm before removing
import numpy as np
import pandas as pd
from pathlib import Path

# torch is an optional dependency: the module can still be imported without
# it (HAS_TORCH is checked in main() before any training is attempted).
try:
    import torch
    import torch.nn as nn
    import torch.optim as optim
    from torch.utils.data import DataLoader, TensorDataset
    HAS_TORCH = True
except ImportError:
    HAS_TORCH = False
    print("warning: pytorch not installed. run: pip install torch")
def parse_args(argv=None):
    """Parse command-line options for model training.

    Args:
        argv: optional list of argument strings. Defaults to None, in which
            case argparse reads sys.argv[1:] (unchanged behavior). Passing
            an explicit list makes the function testable and reusable.

    Returns:
        argparse.Namespace with data/markets/output paths and the
        training hyperparameters (epochs, batch_size, seq_len, train_split).
    """
    parser = argparse.ArgumentParser(description="Train ML models for kalshi backtest")
    parser.add_argument("--data", type=Path, default=Path("data/trades.csv"))
    parser.add_argument("--markets", type=Path, default=Path("data/markets.csv"))
    parser.add_argument("--output", type=Path, default=Path("models"))
    parser.add_argument("--epochs", type=int, default=50)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--seq-len", type=int, default=24)
    parser.add_argument("--train-split", type=float, default=0.8)
    return parser.parse_args(argv)
class LSTMPredictor(nn.Module):
    """Sequence model: stacked LSTM over a price-history window.

    The hidden state of the final timestep is projected to one scalar and
    squashed with tanh, so the output is a signed score in [-1, 1].
    """

    def __init__(self, input_size=1, hidden_size=128, num_layers=2, dropout=0.2):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_size, 1)
        self.tanh = nn.Tanh()

    def forward(self, x):
        """Map a (batch, seq, input_size) tensor to a (batch, 1) score."""
        hidden_states, _ = self.lstm(x)
        # Only the last timestep's representation feeds the head.
        final_step = hidden_states[:, -1, :]
        return self.tanh(self.fc(final_step))
class MLPPredictor(nn.Module):
    """Feed-forward net over hand-crafted features, output in [-1, 1].

    Architecture: [Linear -> ReLU -> Dropout(0.2)] per hidden layer,
    followed by a Linear(..., 1) + Tanh head.
    """

    def __init__(self, input_size=7, hidden_sizes=(64, 32)):
        """Build the MLP.

        Args:
            input_size: number of input features per sample.
            hidden_sizes: iterable of hidden-layer widths. The default is
                an immutable tuple — fixes the original mutable-list
                default argument (shared-state anti-pattern); behavior is
                otherwise identical.
        """
        super().__init__()
        layers = []
        prev_size = input_size
        for h in hidden_sizes:
            layers.append(nn.Linear(prev_size, h))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(0.2))
            prev_size = h
        layers.append(nn.Linear(prev_size, 1))
        layers.append(nn.Tanh())
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Map a (batch, input_size) tensor to a (batch, 1) score."""
        return self.net(x)
def load_data(trades_path: Path, markets_path: Path, seq_len: int):
    """Build training arrays from raw trade and market CSVs.

    For every resolved market (result "yes" or "no"), a sliding window of
    ``seq_len`` trades is moved over its price history. Each window yields:
      - a log-return sequence (LSTM input),
      - a 7-element hand-crafted feature vector (MLP input),
      - a label: +1.0 for a "yes" resolution, -1.0 for "no".

    Args:
        trades_path: CSV with columns ticker, timestamp, price (cents),
            volume, taker_side.
        markets_path: CSV with columns ticker, close_time, result.
        seq_len: sliding-window length, in trades.

    Returns:
        (sequences, features, labels) numpy arrays with matching length.
    """
    print(f"loading trades from {trades_path}...")
    trades = pd.read_csv(trades_path)
    trades["timestamp"] = pd.to_datetime(trades["timestamp"])
    trades = trades.sort_values(["ticker", "timestamp"])

    print(f"loading markets from {markets_path}...")
    markets = pd.read_csv(markets_path)
    markets["close_time"] = pd.to_datetime(markets["close_time"])
    result_map = dict(zip(markets["ticker"], markets["result"]))

    sequences = []
    features = []
    labels = []
    for ticker, group in trades.groupby("ticker"):
        result = result_map.get(ticker)
        if result not in ["yes", "no"]:
            # Unresolved / voided market: no ground-truth label available.
            continue
        label = 1.0 if result == "yes" else -1.0

        prices = group["price"].values / 100.0  # cents -> probability in [0, 1]
        volumes = group["volume"].values
        taker_sides = (group["taker_side"] == "yes").astype(float).values
        if len(prices) < seq_len:
            continue

        for i in range(seq_len, len(prices)):
            seq = prices[i - seq_len : i]
            # Clip before log so a zero price cannot produce -inf.
            log_returns = np.diff(np.log(np.clip(seq, 1e-6, 1.0)))
            if len(log_returns) == seq_len - 1:
                # diff() shortens the window by one; left-pad with 0 so the
                # LSTM always sees exactly seq_len steps.
                log_returns = np.pad(log_returns, (1, 0), mode="constant")
            sequences.append(log_returns)

            curr_price = prices[i - 1]
            # Price change across the window. (The original guarded this
            # with `if len(prices) > seq_len else 0`, but the loop only
            # runs when that holds — the else-branch was dead code.)
            momentum = prices[i - 1] - prices[i - seq_len]
            mean_price = np.mean(prices[i - seq_len : i])
            mean_reversion = mean_price - curr_price
            vol_sum = np.sum(volumes[i - seq_len : i])
            buy_vol = np.sum(volumes[i - seq_len : i] * taker_sides[i - seq_len : i])
            sell_vol = vol_sum - buy_vol
            # Signed order-flow imbalance in [-1, 1]; max() avoids div by 0.
            order_flow = (buy_vol - sell_vol) / max(vol_sum, 1)
            feat = [
                momentum,
                mean_reversion,
                np.log1p(vol_sum),
                order_flow,
                curr_price,
                np.std(log_returns) if len(log_returns) > 1 else 0,
                len(group) / 1000.0,  # crude activity/liquidity proxy
            ]
            features.append(feat)
            labels.append(label)

    print(f"created {len(sequences)} training samples")
    return np.array(sequences), np.array(features), np.array(labels)
def train_lstm(sequences, labels, args):
    """Train the LSTM on log-return sequences.

    Args:
        sequences: (n, seq_len) array of log-return windows.
        labels: (n,) array of +1/-1 resolution labels.
        args: namespace with train_split, batch_size, epochs.

    Returns:
        The trained LSTMPredictor.
    """
    print("\n" + "=" * 50)
    print("Training LSTM")
    print("=" * 50)

    # Plain chronological split (data arrives pre-ordered from load_data);
    # only the training loader shuffles.
    n = len(sequences)
    split = int(n * args.train_split)
    X_train = torch.tensor(sequences[:split], dtype=torch.float32).unsqueeze(-1)
    y_train = torch.tensor(labels[:split], dtype=torch.float32).unsqueeze(-1)
    X_test = torch.tensor(sequences[split:], dtype=torch.float32).unsqueeze(-1)
    y_test = torch.tensor(labels[split:], dtype=torch.float32).unsqueeze(-1)
    train_dataset = TensorDataset(X_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)

    model = LSTMPredictor(input_size=1, hidden_size=128, num_layers=2)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(args.epochs):
        model.train()
        total_loss = 0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            output = model(X_batch)
            loss = criterion(output, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        if (epoch + 1) % 10 == 0:
            # BUG FIX: nn.Module has no set_mode_to_inference() — the
            # original raised AttributeError at the first logging epoch.
            # eval() is the correct call (disables dropout for evaluation);
            # train() at the top of the loop re-enables it.
            model.eval()
            with torch.no_grad():
                train_pred = model(X_train)
                test_pred = model(X_test)
                # Directional accuracy: sign of prediction vs sign of label.
                train_acc = ((train_pred > 0) == (y_train > 0)).float().mean()
                test_acc = ((test_pred > 0) == (y_test > 0)).float().mean()
            print(f"epoch {epoch + 1}/{args.epochs}: loss={total_loss/len(train_loader):.4f}, train_acc={train_acc:.3f}, test_acc={test_acc:.3f}")
    return model
def train_mlp(features, labels, args):
    """Train the MLP on hand-crafted feature vectors.

    Args:
        features: (n, n_features) array of feature rows.
        labels: (n,) array of +1/-1 resolution labels.
        args: namespace with train_split, batch_size, epochs.

    Returns:
        The trained MLPPredictor.
    """
    print("\n" + "=" * 50)
    print("Training MLP")
    print("=" * 50)

    # Z-score normalization; the epsilon guards constant columns.
    features = (features - features.mean(axis=0)) / (features.std(axis=0) + 1e-8)

    n = len(features)
    split = int(n * args.train_split)
    X_train = torch.tensor(features[:split], dtype=torch.float32)
    y_train = torch.tensor(labels[:split], dtype=torch.float32).unsqueeze(-1)
    X_test = torch.tensor(features[split:], dtype=torch.float32)
    y_test = torch.tensor(labels[split:], dtype=torch.float32).unsqueeze(-1)
    train_dataset = TensorDataset(X_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)

    model = MLPPredictor(input_size=features.shape[1])
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(args.epochs):
        model.train()
        total_loss = 0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            output = model(X_batch)
            loss = criterion(output, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        if (epoch + 1) % 10 == 0:
            # BUG FIX: nn.Module has no set_mode_to_inference() — the
            # original raised AttributeError at the first logging epoch.
            # eval() disables dropout; train() at the loop top restores it.
            model.eval()
            with torch.no_grad():
                train_pred = model(X_train)
                test_pred = model(X_test)
                # Directional accuracy: sign of prediction vs sign of label.
                train_acc = ((train_pred > 0) == (y_train > 0)).float().mean()
                test_acc = ((test_pred > 0) == (y_test > 0)).float().mean()
            print(f"epoch {epoch + 1}/{args.epochs}: loss={total_loss/len(train_loader):.4f}, train_acc={train_acc:.3f}, test_acc={test_acc:.3f}")
    return model
def export_onnx(model, output_path: Path, input_shape, input_name="input", output_name="output"):
model.set_mode_to_inference()
dummy_input = torch.randn(*input_shape)
torch.onnx.export(
model,
dummy_input,
output_path,
input_names=[input_name],
output_names=[output_name],
dynamic_axes={
input_name: {0: "batch_size"},
output_name: {0: "batch_size"},
},
opset_version=14,
)
print(f"exported to {output_path}")
def main():
    """Entry point: validate inputs, build datasets, train both models,
    and export them to ONNX.

    Returns:
        0 on success, 1 on any precondition failure (missing torch,
        missing input files, or too few training samples).
    """
    args = parse_args()
    # Fail fast on missing prerequisites before touching the filesystem.
    if not HAS_TORCH:
        print("error: pytorch required for training. install with: pip install torch")
        return 1
    if not args.data.exists():
        print(f"error: data file not found: {args.data}")
        return 1
    if not args.markets.exists():
        print(f"error: markets file not found: {args.markets}")
        return 1
    args.output.mkdir(parents=True, exist_ok=True)
    sequences, features, labels = load_data(args.data, args.markets, args.seq_len)
    # Arbitrary floor to avoid training on a degenerate dataset.
    if len(sequences) < 100:
        print(f"error: not enough training data ({len(sequences)} samples)")
        return 1
    lstm_model = train_lstm(sequences, labels, args)
    # LSTM expects (batch, seq_len, 1); MLP expects (batch, n_features).
    export_onnx(lstm_model, args.output / "lstm.onnx", (1, args.seq_len, 1))
    mlp_model = train_mlp(features, labels, args)
    export_onnx(mlp_model, args.output / "mlp.onnx", (1, features.shape[1]))
    print("\n" + "=" * 50)
    print("Training complete!")
    print(f"Models saved to: {args.output}")
    print("=" * 50)
    return 0
# NOTE(review): builtin exit() works in scripts, but sys.exit() is the
# conventional choice — consider switching.
if __name__ == "__main__":
    exit(main())