Implements PyTorch training patterns, data loading, and GPU optimization. Covers tensor operations, DataLoader/Dataset classes, training loops, CUDA memory management, and debugging common errors. Use when writing PyTorch code, debugging tensor shape mismatches, fixing CUDA OOM errors, optimizing training speed, or when user mentions 'PyTorch', 'tensor', 'DataLoader', 'training loop', 'GPU memory', 'CUDA', '.to(device)', 'model.eval()', 'torch.no_grad()', 'shape mismatch', 'environment setup', 'nvidia-smi', or 'CUDA setup'.
Resources
1Install
npx skillscat add levy-n/claude-useful-skills/pytorch-mastery
Install via the SkillsCat registry.
SKILL.md
PyTorch Mastery - Practical Deep Learning
PyTorch patterns, debugging, and GPU optimization.
Quick Start - Complete Training Pipeline
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
# Device setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Data
# X, y and input_dim are supplied by the caller (not defined in this snippet).
# Targets are reshaped to [N, 1] so they line up with the model's [batch, 1]
# output. A 1-D target of shape [batch] would be broadcast against the
# [batch, 1] prediction inside MSELoss, yielding a wrong loss without an error.
features = torch.as_tensor(X, dtype=torch.float32)
targets = torch.as_tensor(y, dtype=torch.float32).reshape(-1, 1)
dataset = TensorDataset(features, targets)
train_loader = DataLoader(dataset, batch_size=32, shuffle=True)

# Model
model = nn.Sequential(
    nn.Linear(input_dim, 64),
    nn.ReLU(),
    nn.Linear(64, 1),
).to(device)

# Training
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(100):
    model.train()
    for X_batch, y_batch in train_loader:
        # Batches are created on the CPU; move them next to the model.
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
    if epoch % 10 == 0:
        # Reports the loss of the last batch processed in this epoch.
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

# When This Skill Activates
Use this skill when:
- Writing PyTorch training code
- Creating custom Dataset/DataLoader
- Debugging tensor shape mismatches
- Fixing CUDA out of memory errors
- Optimizing GPU utilization
- Saving/loading model checkpoints
Core Patterns
Pattern 1: Tensor Shapes (NCHW Format)
# Image tensors: [Batch, Channels, Height, Width]
images = torch.randn(32, 3, 224, 224)
# │ │ │ │
# │ │ │ └── Width
# │ │ └── Height
# │ └── Channels (RGB=3, Grayscale=1)
# └── Batch size
# Feature tensors: [Batch, Features]
features = torch.randn(32, 128)
# │ │
# │ └── Number of features
# └── Batch size
# Sequence tensors: [Batch, Sequence, Features]
sequences = torch.randn(32, 100, 256)
# │ │ │
# │ │ └── Feature/embedding dim
# │ └── Sequence length
# └── Batch size
Pattern 2: Custom Dataset Class
from torch.utils.data import Dataset, DataLoader
class CustomDataset(Dataset):
    """Map-style dataset pairing each sample with its target.

    Both inputs are stored as independent ``float32`` tensors. Indexing
    returns the ``(x, y)`` pair at that position; ``transform``, when given,
    is applied to ``x`` only.

    Args:
        X: Array-like or tensor of samples, indexed along its first axis.
        y: Array-like or tensor of targets, indexed along its first axis.
        transform: Optional callable applied to a sample each time it is
            accessed.
    """

    def __init__(self, X, y, transform=None):
        self.X = self._as_float_tensor(X)
        self.y = self._as_float_tensor(y)
        self.transform = transform

    @staticmethod
    def _as_float_tensor(data):
        """Return a float32 tensor holding a copy of ``data``."""
        if isinstance(data, torch.Tensor):
            # torch.tensor(<tensor>) emits a UserWarning; copy explicitly.
            return data.detach().clone().to(torch.float32)
        return torch.tensor(data, dtype=torch.float32)

    def __len__(self):
        """Return the number of samples (length of ``X``)."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``(x, y)`` at ``idx``, with ``transform`` applied to ``x``."""
        x = self.X[idx]
        y = self.y[idx]
        # Compare against None: a callable that defines __len__/__bool__
        # (e.g. an empty container module) must not be skipped by accident.
        if self.transform is not None:
            x = self.transform(x)
        return x, y
# Usage
dataset = CustomDataset(X_train, y_train)
train_loader = DataLoader(
dataset,
batch_size=32,
shuffle=True, # Shuffle for training
num_workers=4, # Parallel loading
pin_memory=True # Faster GPU transfer
)
Pattern 3: DataLoader Parameters
# Training loader (shuffle, augmentation)
train_loader = DataLoader(
train_dataset,
batch_size=32,
shuffle=True, # Random order each epoch
num_workers=4, # Parallel data loading
pin_memory=True, # Faster CPU→GPU transfer
drop_last=True # Drop incomplete last batch
)
# Validation/Test loader (no shuffle, no augmentation)
val_loader = DataLoader(
val_dataset,
batch_size=64, # Can be larger (no gradients)
shuffle=False, # Keep order for reproducibility
num_workers=4,
pin_memory=True
)
Pattern 4: Device Management
# Setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Move model to device
model = model.to(device)
# Move data to device (in training loop)
for X_batch, y_batch in train_loader:
X_batch = X_batch.to(device)
y_batch = y_batch.to(device)
# ... training code
# Check model device
print(next(model.parameters()).device)
# Multi-GPU (DataParallel)
if torch.cuda.device_count() > 1:
model = nn.DataParallel(model)
Pattern 5: Training Loop with Validation
def train_epoch(model, train_loader, criterion, optimizer, device):
    """Run one optimization pass over ``train_loader``.

    The model is switched to training mode. For every batch the inputs and
    targets are moved to ``device``, gradients are reset, the loss is
    computed and back-propagated, and the optimizer takes one step.

    Args:
        model: Module to train; called as ``model(X_batch)``.
        train_loader: Iterable yielding ``(X_batch, y_batch)`` tensor pairs.
            It does not need to support ``len()``.
        criterion: Callable ``criterion(outputs, y_batch)`` returning a
            scalar loss tensor.
        optimizer: Optimizer bound to the model's parameters.
        device: Device the batches are moved to.

    Returns:
        float: Mean of the per-batch loss values.

    Raises:
        ValueError: If ``train_loader`` yields no batches.
    """
    model.train()
    total_loss = 0.0
    # Count batches while iterating instead of calling len(train_loader),
    # which is unavailable for generators and length-less loaders.
    num_batches = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1
    if num_batches == 0:
        raise ValueError("train_loader yielded no batches")
    return total_loss / num_batches
def validate(model, val_loader, criterion, device):
    """Compute the mean per-batch loss over ``val_loader`` without training.

    The model is switched to evaluation mode and every batch is evaluated
    under ``torch.no_grad()``, so no gradients are recorded and no parameters
    are updated. The model is left in evaluation mode on return.

    Args:
        model: Module to evaluate; called as ``model(X_batch)``.
        val_loader: Iterable yielding ``(X_batch, y_batch)`` tensor pairs.
            It does not need to support ``len()``.
        criterion: Callable ``criterion(outputs, y_batch)`` returning a
            scalar loss tensor.
        device: Device the batches are moved to.

    Returns:
        float: Mean of the per-batch loss values.

    Raises:
        ValueError: If ``val_loader`` yields no batches.
    """
    model.eval()
    total_loss = 0.0
    # Count batches while iterating instead of calling len(val_loader),
    # which is unavailable for generators and length-less loaders.
    num_batches = 0
    with torch.no_grad():  # No gradient computation
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            total_loss += loss.item()
            num_batches += 1
    if num_batches == 0:
        raise ValueError("val_loader yielded no batches")
    return total_loss / num_batches
# Full training
best_val_loss = float('inf')
for epoch in range(num_epochs):
train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
val_loss = validate(model, val_loader, criterion, device)
print(f"Epoch {epoch}: Train={train_loss:.4f}, Val={val_loss:.4f}")
# Save best model
if val_loss < best_val_loss:
best_val_loss = val_loss
torch.save(model.state_dict(), 'best_model.pt')
Pattern 6: Save/Load Checkpoints
# Save checkpoint (full state)
# epoch, model, optimizer, loss and best_val_loss come from the surrounding
# training code (not defined in this snippet).
checkpoint = {
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': loss,
    'best_val_loss': best_val_loss,
}
torch.save(checkpoint, 'checkpoint.pt')

# Load checkpoint
# map_location='cpu' lets a checkpoint written on a GPU load on any machine;
# load_state_dict then copies the values onto the devices the model and
# optimizer parameters already live on.
checkpoint = torch.load('checkpoint.pt', map_location='cpu')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
# NOTE(review): if the checkpoint was written after epoch `epoch` finished,
# resume from checkpoint['epoch'] + 1 to avoid repeating it - confirm
# against where the save happens in the training loop.
start_epoch = checkpoint['epoch']

# Save just model weights
torch.save(model.state_dict(), 'model_weights.pt')

# Load just model weights
model.load_state_dict(torch.load('model_weights.pt', map_location='cpu'))
model.eval()  # Set to evaluation mode

# Pattern 7: GPU Memory Management
# Check GPU memory
print(f"Allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
print(f"Cached: {torch.cuda.memory_reserved() / 1e9:.2f} GB")
# Clear cache
torch.cuda.empty_cache()
# Gradient checkpointing (trade compute for memory)
from torch.utils.checkpoint import checkpoint
# Use in forward: output = checkpoint(self.layer, input)
# Delete tensors explicitly
del large_tensor
torch.cuda.empty_cache()
# Reduce batch size if OOM
# Or use gradient accumulation:
# model, criterion, optimizer, train_loader and device come from the
# surrounding training code (not defined in this snippet).
accumulation_steps = 4
optimizer.zero_grad()
pending = 0  # batches accumulated since the last optimizer step
for X, y in train_loader:
    # Batches are created on the CPU; move them next to the model.
    X, y = X.to(device), y.to(device)
    # Divide so the summed gradients equal the average over the group.
    loss = criterion(model(X), y) / accumulation_steps
    loss.backward()
    pending += 1
    if pending == accumulation_steps:
        optimizer.step()
        optimizer.zero_grad()
        pending = 0
# Apply gradients left over when the number of batches is not a multiple of
# accumulation_steps; otherwise the final partial group is never used.
# (Its gradient is still scaled by 1 / accumulation_steps.)
if pending:
    optimizer.step()
    optimizer.zero_grad()

# Pattern 8: Model Accuracy Function
def model_accuracy(model, dataloader):
    """Calculate classification accuracy with the model in eval mode.

    Each batch is moved to the device of the model's first parameter, the
    predicted class is the index of the largest value along dimension 1 of
    the model output, and predictions are compared element-wise with the
    labels. The model is left in evaluation mode on return.

    Args:
        model: Module called as ``model(X_batch)``; its output is indexed
            along dimension 1 to pick the predicted class.
        dataloader: Iterable yielding ``(X_batch, y_batch)`` tensor pairs.

    Returns:
        float: ``correct / total`` as a Python float.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    model.eval()
    # Auto-detect device. A module without parameters has no device to
    # detect, so its batches are passed through unmoved.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None
    correct = 0
    total = 0
    with torch.no_grad():
        for X_batch, y_batch in dataloader:
            if device is not None:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)
            # For classification
            predicted = outputs.argmax(dim=1)
            total += y_batch.size(0)
            correct += (predicted == y_batch).sum().item()
    if total == 0:
        raise ValueError("dataloader yielded no samples")
    return correct / total  # Returns Python float

# Reference Navigation
For detailed content, see:
- Tensors & Operations: reference/tensors_operations.md - Creation, Broadcasting, NCHW format
- Training Patterns: reference/training_patterns.md - Loops, Validation, Early stopping
- Memory Management: reference/memory_management.md - GPU optimization, Gradient accumulation
- Debugging Tips: reference/debugging_tips.md - Shape errors, CUDA issues
- Code Templates: reference/code_templates.md - Ready-to-use snippets
- Environment Setup: reference/environment_setup.md - GPU/CUDA verification, System health check
Common Mistakes to Avoid
1. Forgetting model.eval() for Inference
# WRONG: Dropout still active, BatchNorm uses batch stats
predictions = model(X_test)
# CORRECT: Switch to eval mode
model.eval()
with torch.no_grad():
predictions = model(X_test)
2. Not Using torch.no_grad() for Inference
# WRONG: Wastefully computes gradients
model.eval()
predictions = model(X_test) # Still tracking gradients!
# CORRECT: Disable gradient computation
model.eval()
with torch.no_grad(): # Saves memory and time
predictions = model(X_test)
3. Shape Mismatch Debugging
# Add shape prints for debugging
print(f"Input shape: {x.shape}")
x = self.conv1(x)
print(f"After conv1: {x.shape}")
x = self.pool(x)
print(f"After pool: {x.shape}")
x = x.view(x.size(0), -1) # Flatten
print(f"After flatten: {x.shape}")
# Common fix: Wrong flatten size
# Calculate: channels × height × width after convolutions
4. GPU/CPU Tensor Mismatch
# WRONG: Mixing devices
model = model.cuda()
X = torch.randn(32, 10) # On CPU!
output = model(X) # Error!
# CORRECT: Same device
model = model.to(device)
X = X.to(device)
output = model(X)
5. In-place Operations Breaking Gradients
# WRONG: In-place modification
x = model(input)
x += 1 # In-place! May break gradient computation
# CORRECT: Create new tensor
x = model(input)
x = x + 1 # New tensor
6. Not Converting NumPy to Tensor Properly
# WRONG: Wrong dtype
X_tensor = torch.tensor(X_numpy) # May be float64
# CORRECT: Specify dtype
X_tensor = torch.tensor(X_numpy, dtype=torch.float32)
# Or
X_tensor = torch.from_numpy(X_numpy).float()
Teaching Mode
When explaining PyTorch concepts:
Tensor Visual
Tensor = N-dimensional array
0D: scalar 42
1D: vector [1, 2, 3]
2D: matrix [[1,2], [3,4]]
3D: volume [[[1,2],[3,4]], [[5,6],[7,8]]]
4D: batch+3D Images: [batch, channels, height, width]
Training Loop Visual
┌─────────────────────────────────────────┐
│ TRAINING LOOP │
├─────────────────────────────────────────┤
│ │
│ for epoch in epochs: │
│ for batch in dataloader: │
│ │
│ ① optimizer.zero_grad() │
│ ↓ │
│ ② output = model(X) Forward │
│ ↓ │
│ ③ loss = criterion(output, y) │
│ ↓ │
│ ④ loss.backward() Backward│
│ ↓ │
│ ⑤ optimizer.step() Update │
│ │
└─────────────────────────────────────────┘
GPU Memory Analogy
GPU Memory = Hotel
- Tensors = Guests checking in
- .to(device) = Booking a room
- .cpu() = Checking out
- torch.cuda.empty_cache() = Cleaning empty rooms
- OOM Error = Hotel full!
Solutions:
- Smaller batch = fewer guests at once
- Gradient accumulation = guests take turns
- del tensor = force checkout