Implements sequence models for time series and text. Covers RNN fundamentals, LSTM/GRU architectures, time series forecasting, text generation with language models, and sequence classification. Use when working with sequential data, predicting time series, text generation, or when user mentions 'RNN', 'LSTM', 'GRU', 'vanishing gradient', 'hidden state', 'time series', 'sequence-to-sequence', 'text generation', 'next word prediction', or 'recurrent neural network'.
Resources
1Install
npx skillscat add levy-n/claude-useful-skills/sequence-models Install via the SkillsCat registry.
SKILL.md
Sequence Models - RNN, LSTM & Time Series
מודלים לסדרות: RNN, LSTM, חיזוי סדרות זמן, ויצירת טקסט.
Quick Start - LSTM for Time Series
import torch
import torch.nn as nn
class LSTMPredictor(nn.Module):
    """Predict from a sequence using the top LSTM layer's final hidden state.

    Args:
        input_size: Number of features per timestep.
        hidden_size: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of values produced per input sequence.
    """

    def __init__(self, input_size, hidden_size, num_layers=1, output_size=1):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Map ``x`` of shape [batch, seq_len, features] to [batch, output_size]."""
        # Only the final hidden states are needed; the per-step outputs and
        # the cell state are discarded.
        _, (final_hidden, _) = self.lstm(x)
        # final_hidden is indexed by layer first, so [-1] is the top layer.
        top_layer_state = final_hidden[-1]
        return self.fc(top_layer_state)
# Usage
model = LSTMPredictor(input_size=1, hidden_size=64, num_layers=2)
X = torch.randn(32, 100, 1) # [batch, seq_len, features]
output = model(X) # [32, 1]
When This Skill Activates
Use this skill when:
- Working with sequential data (time series, text)
- Predicting future values in a series
- Building text generation models
- Classifying sequences (sentiment, activity)
- Understanding RNN/LSTM/GRU architectures
Core Patterns
Pattern 1: RNN vs LSTM vs GRU
| Model | Strengths | Weaknesses |
|---|---|---|
| RNN | Simple, fast | Vanishing gradients, short memory |
| LSTM | Long-term memory, gating | More parameters, slower |
| GRU | Simpler than LSTM, good memory | Less expressive than LSTM |
# RNN
rnn = nn.RNN(input_size=10, hidden_size=64, num_layers=2, batch_first=True)
# LSTM (recommended for most cases)
lstm = nn.LSTM(input_size=10, hidden_size=64, num_layers=2, batch_first=True)
# GRU (simpler alternative to LSTM)
gru = nn.GRU(input_size=10, hidden_size=64, num_layers=2, batch_first=True)
Pattern 2: LSTM Shapes (CRITICAL!)
# Input: [batch, seq_len, input_size]
# With batch_first=True (recommended)
lstm = nn.LSTM(
input_size=10, # Features per timestep
hidden_size=64, # Hidden state size
num_layers=2, # Stacked LSTM layers
batch_first=True, # [batch, seq, features] format
dropout=0.2 # Between LSTM layers (if num_layers > 1)
)
# Forward pass
X = torch.randn(32, 100, 10) # [batch, seq_len, input_size]
output, (h_n, c_n) = lstm(X)
# Outputs:
# output: [32, 100, 64] - All hidden states
# h_n: [2, 32, 64] - Final hidden state (per layer)
# c_n: [2, 32, 64] - Final cell state (per layer)
Pattern 3: Time Series Sliding Windows
def create_sequences(data, seq_length, forecast_horizon=1):
    """Create sliding-window (input, target) pairs for time series forecasting.

    Window ``i`` uses ``data[i:i + seq_length]`` as input and the following
    ``forecast_horizon`` steps as the target.

    Args:
        data: Array-like indexed by time along its first axis, either 1-D
            ``[time]`` or 2-D ``[time, features]``.
        seq_length: Number of past timesteps in each input window (>= 1).
        forecast_horizon: Number of future timesteps in each target (>= 1).

    Returns:
        Tuple ``(X, y)`` of NumPy arrays with shapes
        ``[num_samples, seq_length, ...]`` and
        ``[num_samples, forecast_horizon, ...]``. When ``data`` is too short
        for a single window, both arrays have ``num_samples == 0`` but keep
        their trailing dimensions.

    Raises:
        ValueError: If ``seq_length`` or ``forecast_horizon`` is less than 1.
    """
    # Local import: the surrounding snippet uses ``np`` without importing it.
    import numpy as np

    if seq_length < 1 or forecast_horizon < 1:
        raise ValueError("seq_length and forecast_horizon must both be >= 1")

    data = np.asarray(data)
    num_samples = len(data) - seq_length - forecast_horizon + 1
    feature_dims = data.shape[1:]

    if num_samples <= 0:
        # Keep the window dimensions so downstream shape logic still works.
        return (
            np.empty((0, seq_length) + feature_dims, dtype=data.dtype),
            np.empty((0, forecast_horizon) + feature_dims, dtype=data.dtype),
        )

    X = np.stack([data[i:i + seq_length] for i in range(num_samples)])
    y = np.stack([
        data[i + seq_length:i + seq_length + forecast_horizon]
        for i in range(num_samples)
    ])
    return X, y
# Usage
data = df['value'].values
seq_length = 30 # Use 30 timesteps to predict
forecast_horizon = 7 # Predict next 7 timesteps
X, y = create_sequences(data, seq_length, forecast_horizon)
# X shape: [num_samples, 30] (1-D input; use X[..., None] to get [num_samples, 30, 1] for an LSTM)
# y shape: [num_samples, 7]
Pattern 4: Time Series with Multiple Features
class MultiFeatureLSTM(nn.Module):
    """LSTM forecaster for multivariate time series.

    Args:
        num_features: Number of input features per timestep.
        hidden_size: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of values predicted per sequence.
        dropout: Dropout probability between stacked LSTM layers. It is
            ignored when ``num_layers == 1``, because ``nn.LSTM`` only applies
            dropout between layers and warns if asked to with a single layer.
    """

    def __init__(self, num_features, hidden_size, num_layers, output_size,
                 dropout=0.2):
        super().__init__()
        # Inter-layer dropout is meaningless for one layer; pass 0 to avoid
        # the PyTorch warning.
        lstm_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(num_features, hidden_size, num_layers,
                            batch_first=True, dropout=lstm_dropout)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map ``x`` of shape [batch, seq_len, num_features] to [batch, output_size]."""
        lstm_out, _ = self.lstm(x)
        # Use the output at the last timestep as the sequence summary.
        return self.fc(lstm_out[:, -1, :])
# Example with multiple features
# Features: [temperature, humidity, pressure, hour_sin, hour_cos, is_weekend]
model = MultiFeatureLSTM(num_features=6, hidden_size=128, num_layers=2, output_size=1)
Pattern 5: Text Classification with LSTM
class TextClassifier(nn.Module):
    """Bidirectional single-layer LSTM classifier over token-index sequences.

    Args:
        vocab_size: Number of entries in the embedding table.
        embed_dim: Embedding dimension.
        hidden_size: LSTM hidden size per direction.
        num_classes: Number of output classes (logits).
    """

    def __init__(self, vocab_size, embed_dim, hidden_size, num_classes):
        super().__init__()
        # Index 0 is reserved for padding; its embedding stays at zero.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_size, batch_first=True,
                            bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, num_classes)  # *2 for bidirectional

    def forward(self, x, lengths=None):
        """Return class logits of shape [batch, num_classes].

        Args:
            x: Token indices of shape [batch, seq_len], right-padded with 0.
            lengths: Optional true (unpadded) length of each sequence, as a
                list or 1-D tensor of positive ints. When given, the batch is
                packed so the final hidden states are taken at each sequence's
                real last and first token instead of at padding positions.
                When omitted, the full padded sequence is processed.
        """
        embedded = self.embedding(x)  # [batch, seq_len, embed_dim]
        if lengths is not None:
            # pack_padded_sequence requires the lengths on the CPU.
            lengths = torch.as_tensor(lengths, dtype=torch.int64).cpu()
            embedded = nn.utils.rnn.pack_padded_sequence(
                embedded, lengths, batch_first=True, enforce_sorted=False
            )
        _, (h_n, _) = self.lstm(embedded)
        # h_n[-2] is the forward direction's final state and h_n[-1] the
        # backward direction's.
        hidden = torch.cat((h_n[-2], h_n[-1]), dim=1)  # [batch, hidden*2]
        return self.fc(hidden)


# Pattern 6: Language Model (Text Generation)
class LanguageModel(nn.Module):
    """Embedding -> LSTM -> vocabulary projection for next-token prediction.

    Args:
        vocab_size: Size of the vocabulary (embedding rows and output logits).
        embed_dim: Embedding dimension.
        hidden_size: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embed_dim, hidden_size, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embed_dim,
        )
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x, hidden=None):
        """Return ``(logits, hidden)`` for a batch of token indices.

        ``hidden`` is the LSTM state from a previous call (or ``None`` to
        start fresh); the updated state is returned so generation can
        continue one step at a time.
        """
        token_vectors = self.embedding(x)
        states, next_hidden = self.lstm(token_vectors, hidden)
        return self.fc(states), next_hidden
def generate_text(model, seed_text, tokenizer, max_length=100, temperature=1.0):
    """Generate text autoregressively from a seed string.

    Args:
        model: Callable as ``model(input_ids, hidden) -> (logits, hidden)``
            with logits of shape [batch, seq_len, vocab]; must provide
            ``eval()``.
        seed_text: Prompt used to prime the model.
        tokenizer: Object with ``encode(str) -> token ids`` and
            ``decode(ids) -> str``. Its ``eos_token_id``, if present, stops
            generation early.
        max_length: Maximum number of new tokens to sample.
        temperature: Sampling temperature. Values below 1 sharpen the
            distribution, values above 1 flatten it, and ``0`` selects the
            most likely token at every step (greedy decoding).

    Returns:
        The decoded seed plus generated tokens (including the EOS token when
        one is produced).

    Raises:
        ValueError: If ``temperature`` is negative or ``seed_text`` encodes
            to no tokens.
    """
    if temperature < 0:
        raise ValueError("temperature must be >= 0")

    model.eval()
    tokens = list(tokenizer.encode(seed_text))
    if not tokens:
        # An empty tensor would default to float dtype and break the embedding.
        raise ValueError("seed_text must encode to at least one token")

    # Build inputs on the model's device so GPU-resident models work too.
    parameters = getattr(model, "parameters", None)
    first_param = next(iter(parameters()), None) if callable(parameters) else None
    device = first_param.device if first_param is not None else torch.device("cpu")

    eos_token_id = getattr(tokenizer, "eos_token_id", None)
    input_ids = torch.tensor([tokens], dtype=torch.long, device=device)
    hidden = None
    generated = list(tokens)

    with torch.no_grad():
        for _ in range(max_length):
            logits, hidden = model(input_ids, hidden)
            logits = logits[:, -1, :]  # only the last position predicts the next token
            if temperature == 0:
                # Greedy decoding avoids dividing by zero.
                next_token = int(torch.argmax(logits, dim=-1).item())
            else:
                probs = torch.softmax(logits / temperature, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1).item()
            generated.append(next_token)
            if eos_token_id is not None and next_token == eos_token_id:
                break
            # The hidden state carries the context, so feed only the new token.
            input_ids = torch.tensor([[next_token]], dtype=torch.long, device=device)

    return tokenizer.decode(generated)


# Pattern 7: Time Series Train/Val/Test Split
# ⚠️ CRITICAL: No shuffle for time series!
# Must maintain temporal order
def time_series_split(data, train_ratio=0.7, val_ratio=0.15, gap=0):
    """Split a time series into train/val/test while preserving order.

    Args:
        data: Sliceable sequence ordered by time.
        train_ratio: Fraction of samples assigned to the training split.
        val_ratio: Fraction of samples assigned to the validation split; the
            remainder goes to the test split.
        gap: Number of samples dropped at the start of the validation and
            test splits, so windows built near a boundary cannot overlap the
            previous split. Defaults to 0 (contiguous splits).

    Returns:
        Tuple ``(train, val, test)`` of slices of ``data``.

    Raises:
        ValueError: If a ratio is negative, the ratios sum to more than 1,
            or ``gap`` is negative.
    """
    # Small tolerance so ratios that sum to 1 only up to float error pass.
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError("ratios must be non-negative and sum to at most 1")
    if gap < 0:
        raise ValueError("gap must be >= 0")

    n = len(data)
    train_end = int(n * train_ratio)
    val_end = int(n * (train_ratio + val_ratio))

    train = data[:train_end]
    val = data[train_end + gap:val_end]
    test = data[val_end + gap:]
    return train, val, test
# With gap to prevent leakage
def time_series_split_with_gap(data, train_ratio=0.7, val_ratio=0.15, gap=7):
    """Split a time series in order, leaving a gap before val and test.

    The first ``gap`` samples after each split boundary are discarded, so
    windows built at the start of a later split do not reach back into the
    previous one.

    Args:
        data: Sliceable sequence ordered by time.
        train_ratio: Fraction of samples assigned to the training split.
        val_ratio: Fraction of samples assigned to the validation split
            (before the gap is removed from it).
        gap: Number of samples skipped after each boundary.

    Returns:
        Tuple ``(train, val, test)`` of slices of ``data``. A split is empty
        when ``gap`` is at least as large as that split's share.

    Raises:
        ValueError: If a ratio is negative, the ratios sum to more than 1,
            or ``gap`` is negative (which would make the splits overlap).
    """
    # Small tolerance so ratios that sum to 1 only up to float error pass.
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError("ratios must be non-negative and sum to at most 1")
    if gap < 0:
        raise ValueError("gap must be >= 0")

    n = len(data)
    train_end = int(n * train_ratio)
    val_start = train_end + gap
    val_end = int(n * (train_ratio + val_ratio))
    test_start = val_end + gap

    train = data[:train_end]
    val = data[val_start:val_end]
    test = data[test_start:]
    return train, val, test


# Pattern 8: Stacked LSTM with Dropout
class StackedLSTM(nn.Module):
    """Multi-layer LSTM with dropout between layers and before the output head.

    Args:
        input_size: Number of features per timestep.
        hidden_size: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of values predicted per sequence.
        dropout: Dropout probability, used between LSTM layers (only when
            ``num_layers > 1``) and on the features fed to the final linear
            layer.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout=0.2):
        super().__init__()
        # nn.LSTM applies dropout only between layers and warns when asked to
        # with a single layer, so disable it in that case.
        inter_layer_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
        )
        self.dropout = nn.Dropout(dropout)  # Before final FC
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map ``x`` of shape [batch, seq_len, input_size] to [batch, output_size]."""
        lstm_out, _ = self.lstm(x)
        last_step = self.dropout(lstm_out[:, -1, :])  # Last timestep
        return self.fc(last_step)


# Reference Navigation
For detailed content, see:
- RNN Fundamentals: reference/rnn_fundamentals.md - RNN formula, Hidden state, Vanishing gradients
- LSTM/GRU: reference/lstm_gru.md - Architecture, Gates, Time series patterns
- Text Generation: reference/text_generation.md - Language models, Temperature sampling
- Sequence Classification: reference/sequence_classification.md - Text classification with LSTM
Common Mistakes to Avoid
1. Shuffling Time Series Data
# WRONG: shuffle=True randomizes sample order (the DataLoader default is shuffle=False)
train_loader = DataLoader(dataset, batch_size=32, shuffle=True) # Bad!
# CORRECT: No shuffle for time series
train_loader = DataLoader(dataset, batch_size=32, shuffle=False)
2. Fitting Scaler on All Data
# WRONG: Data leakage
scaler = StandardScaler()
data_scaled = scaler.fit_transform(all_data) # Sees future!
# CORRECT: Fit on train only
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_data)
val_scaled = scaler.transform(val_data)
test_scaled = scaler.transform(test_data)
3. Wrong LSTM Output Shape
# LSTM output: [batch, seq_len, hidden_size]
# h_n: [num_layers, batch, hidden_size]
# WRONG: Using wrong dimension
output, (h_n, c_n) = lstm(x)
final_hidden = h_n[0] # First layer, not last!
# CORRECT: Use last layer
final_hidden = h_n[-1] # Last layer's hidden state
4. Forgetting batch_first=True
# WRONG: Default is batch_first=False
lstm = nn.LSTM(10, 64)
X = torch.randn(32, 100, 10) # [batch, seq, features]
output, _ = lstm(X) # No error, but X is silently read as [seq_len=32, batch=100, features=10]!
# CORRECT: Set batch_first=True
lstm = nn.LSTM(10, 64, batch_first=True)
5. Not Detaching Hidden State
# For very long sequences, detach to prevent memory issues
# WRONG: Memory grows indefinitely
hidden = None
for batch in data_loader:
output, hidden = model(batch, hidden) # Keeps full history!
# CORRECT: Detach between batches
hidden = None
for batch in data_loader:
if hidden is not None:
hidden = (hidden[0].detach(), hidden[1].detach())
output, hidden = model(batch, hidden)
Teaching Mode
When explaining sequence models:
RNN Unrolled
Time: t-2 t-1 t t+1
│ │ │ │
▼ ▼ ▼ ▼
Input: x_{t-2} x_{t-1} x_t x_{t+1}
│ │ │ │
▼ ▼ ▼ ▼
┌───┐ ┌───┐ ┌───┐ ┌───┐
│ h │───►│ h │───►│ h │───►│ h │
└───┘ └───┘ └───┘ └───┘
│ │ │ │
▼ ▼ ▼ ▼
Output: y_{t-2} y_{t-1} y_t y_{t+1}
Hidden state h carries information forward through time
LSTM Gates Analogy
LSTM = "Memory with bouncers at the door"
Forget Gate: "Should I throw out old memories?"
Input Gate: "Should I let new info in?"
Output Gate: "What should I tell the next layer?"
The cell state is the "long-term memory"
The hidden state is the "working memory"
Vanishing Gradients
Problem with vanilla RNN:
Gradient: 1.0 → 0.5 → 0.25 → 0.125 → 0.0625 → ~0
Steps back: 1 2 3 4 5
Each step backward through time multiplies the gradient by a factor <1
After many timesteps, the gradient vanishes!
Network can't learn long-term dependencies.
LSTM solution: "Highway" for gradients through cell state