Text processing, sentiment analysis, LLMs, and NLP frameworks. Use for text classification, named entity recognition, or language models.
Resources: 3
Install
npx skillscat add pluginagentmarketplace/custom-plugin-ai-data-scientist/nlp-processing
Install via the SkillsCat registry.
SKILL.md
Natural Language Processing
Process, analyze, and understand text data with modern NLP techniques.
Quick Start
Text Preprocessing
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
def preprocess_text(text):
# Lowercase
text = text.lower()
# Remove special characters
text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
# Tokenize
tokens = word_tokenize(text)
# Remove stopwords
stop_words = set(stopwords.words('english'))
tokens = [w for w in tokens if w not in stop_words]
# Lemmatize
lemmatizer = WordNetLemmatizer()
tokens = [lemmatizer.lemmatize(w) for w in tokens]
return ' '.join(tokens)Sentiment Analysis
from transformers import pipeline
# Pre-trained model
sentiment_analyzer = pipeline("sentiment-analysis")
result = sentiment_analyzer("I love this product!")
# [{'label': 'POSITIVE', 'score': 0.9998}]
# Custom model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(documents)
model = LogisticRegression()
model.fit(X, labels)TF-IDF Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(
max_features=5000,
ngram_range=(1, 2), # Unigrams and bigrams
min_df=2, # Minimum document frequency
max_df=0.8 # Maximum document frequency
)
X = vectorizer.fit_transform(documents)
feature_names = vectorizer.get_feature_names_out()Named Entity Recognition
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple Inc. was founded by Steve Jobs in California.")
for ent in doc.ents:
print(f"{ent.text}: {ent.label_}")
# Apple Inc.: ORG
# Steve Jobs: PERSON
# California: GPEBERT for Text Classification
from transformers import (
BertTokenizer, BertForSequenceClassification,
Trainer, TrainingArguments
)
# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
'bert-base-uncased',
num_labels=2
)
# Tokenize
def tokenize_function(examples):
return tokenizer(
examples['text'],
padding='max_length',
truncation=True,
max_length=128
)
tokenized_datasets = dataset.map(tokenize_function, batched=True)
# Train
training_args = TrainingArguments(
output_dir='./results',
num_train_epochs=3,
per_device_train_batch_size=16,
evaluation_strategy='epoch'
)
trainer = Trainer(
model=model,
args=training_args,
train_dataset=tokenized_datasets['train'],
eval_dataset=tokenized_datasets['test']
)
trainer.train()Text Generation with GPT
from transformers import GPT2LMHeadModel, GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
input_text = "The future of AI is"
input_ids = tokenizer.encode(input_text, return_tensors='pt')
output = model.generate(
input_ids,
max_length=50,
num_return_sequences=1,
temperature=0.7,
top_k=50,
top_p=0.95
)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)Topic Modeling with LDA
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_features=1000, max_df=0.8, min_df=2)
X = vectorizer.fit_transform(documents)
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(X)
# Display topics
feature_names = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):
top_words = [feature_names[i] for i in topic.argsort()[-10:]]
print(f"Topic {topic_idx}: {', '.join(top_words)}")Word Embeddings
from gensim.models import Word2Vec
# Train Word2Vec
sentences = [word_tokenize(doc) for doc in documents]
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1)
# Get vector
vector = model.wv['king']
# Find similar words
similar = model.wv.most_similar('king', topn=5)Common Tasks
Text Classification:
- Sentiment analysis
- Spam detection
- Intent classification
- Topic categorization
Sequence Labeling:
- Named Entity Recognition (NER)
- Part-of-Speech (POS) tagging
- Keyword extraction
Generation:
- Text summarization
- Machine translation
- Chatbots
- Code generation
Best Practices
- Clean text (remove noise, normalize)
- Handle class imbalance
- Use pre-trained models when possible
- Fine-tune on domain-specific data
- Validate with diverse test data
- Monitor for bias and fairness