core/ Quick Reference

Primitives for trait vector extraction and analysis.


Types

from core import VectorSpec, VectorResult, JudgeResult, ProjectionConfig, ModelVariant
from core.types import ActivationMetadata, ProjectionEntry, ProjectionRecord, SteeringRunRecord, SteeringResults

# Identify a single trait vector
spec = VectorSpec(
    layer=9,
    component='residual',      # residual, attn_contribution, mlp_contribution, etc.
    position='response[:]',    # extraction position
    method='probe',            # probe, mean_diff, gradient
    weight=0.9                 # coefficient for steering, relative weight for projection
)

# Single vector config
config = ProjectionConfig.single(9, 'residual', 'response[:]', 'probe', weight=0.9)

# Ensemble (multiple weighted vectors)
config = ProjectionConfig(vectors=[
    VectorSpec(9, 'attn_contribution', 'response[:]', 'probe', 0.5),
    VectorSpec(12, 'residual', 'response[:]', 'probe', 0.3),
    VectorSpec(15, 'residual', 'response[:]', 'probe', 0.2),
])

# Normalized weights (for projection, sum to 1.0)
weights = config.normalized_weights  # [0.5, 0.3, 0.2]

# Serialization
d = spec.to_dict()
spec = VectorSpec.from_dict(d)

# Vector selection returns VectorResult
from utils.vector_selection import select_vector, select_vectors
best = select_vector(experiment, trait)       # VectorResult — walks the validation hierarchy
top = select_vectors(experiment, trait, n=3)  # List[VectorResult]
spec = best.to_vector_spec(weight=1.0)        # Convert to VectorSpec
print(best.delta, best.coherence)             # populated only when best.source == 'steering'

# Model variant from experiment config
from utils.paths import get_model_variant
variant = get_model_variant(experiment)  # ModelVariant(name, model, lora)

# Judge scoring returns JudgeResult
from utils.metrics import summarize_judge_scores
result = summarize_judge_scores(scores)  # JudgeResult(trait_mean, coherence_mean, n, ...)

# Activation metadata (written by extract_vectors.py, read by load_activations.py)
from utils.load_activations import load_activation_metadata
meta = load_activation_metadata(experiment, trait, variant)  # ActivationMetadata
meta.n_layers, meta.hidden_dim, meta.captured_layers, meta.activation_norms

Hooks

from core import CaptureHook, MultiLayerCapture, SteeringHook, AblationHook, get_hook_path, detect_contribution_paths

# Capture from one layer
with CaptureHook(model, "model.layers.16") as hook:
    model(**inputs)
activations = hook.get()  # [batch, seq, hidden]

# Capture from multiple layers
with MultiLayerCapture(model, layers=[14, 15, 16]) as capture:
    model(**inputs)
acts = capture.get(16)
all_acts = capture.get_all()  # {14: tensor, 15: tensor, 16: tensor}

# Capture all layers
with MultiLayerCapture(model) as capture:  # layers=None = all
    model(**inputs)

# Convenience wrapper: batch capture at a position-DSL slice, across prompts
from utils.capture_activations import capture_at_position
acts = capture_at_position(
    model, tokenizer, prompts,
    layers=49, position='prompt[-1]', pool='last',
)  # [n_prompts, hidden_dim]
# layers=int → squeezed, layers=[list] → [n_prompts, n_layers, hidden_dim]
# position DSL strings: 'prompt[-1]', 'prompt[-3:]', 'all[:]' (prefill-only, no response frame)
# pool: 'mean'|'first'|'last'|'none'; pre_formatted=True skips format_prompt
# Correctly handles left-pad offsets; fp32 CPU return.

# Steer generation (add vector to output)
vector = torch.load('vectors/probe_layer16.pt')
with SteeringHook(model, vector, "model.layers.16", coefficient=1.5):
    output = model.generate(**inputs)

# Ablate direction (project out vector from output)
# Implements x' = x - (x · r̂) * r̂
with AblationHook(model, direction, "model.layers.16"):
    output = model.generate(**inputs)

# Path helper (layer + component -> string)
get_hook_path(16)                    # "model.layers.16" (residual)
get_hook_path(16, "attn_contribution", model=model)
# Gemma-2: "model.layers.16.post_attention_layernorm"
# Llama:   "model.layers.16.self_attn.o_proj"

# Components: residual, attn_contribution*, mlp_contribution*, k_proj, v_proj
# *contribution components require model parameter (auto-detect architecture)

Architecture detection:

from core import detect_contribution_paths

paths = detect_contribution_paths(model)
# Gemma-2: {'attn_contribution': 'post_attention_layernorm', 'mlp_contribution': 'post_feedforward_layernorm'}
# Llama/Mistral/Qwen: {'attn_contribution': 'self_attn.o_proj', 'mlp_contribution': 'mlp.down_proj'}
# Unknown architecture: raises ValueError with diagnostic info

Projection hooks (used by inference pipeline for on-GPU projection):

from core import ProjectionHook, MultiLayerProjection

# Project activations onto a vector inside the hook (no PCIe transfer)
with ProjectionHook(model, vector, "model.layers.16") as hook:
    model(**inputs)
scores = hook.get()  # [batch, seq] — already projected

# Multi-layer projection (used by inference pipeline)
with MultiLayerProjection(model, vectors_by_layer={14: vec14, 16: vec16}) as proj:
    model(**inputs)
scores = proj.get(16)  # [batch, seq]

Multi-layer steering (evaluate multiple layers in one forward pass):

from core import MultiLayerSteering

with MultiLayerSteering(model, configs=[(layer, vector, coef) for ...]):
    output = model.generate(**inputs)

Per-position steering (steer only at specific token positions):

from core import PerPositionSteeringHook

# Steer only tokens in range [12, 18)
with PerPositionSteeringHook(model, vector, "model.layers.16", coefficient=1.5, token_range=(12, 18)):
    output = model.generate(**inputs)

Activation capping (clamp activations along a direction):

from core import ActivationCappingHook
# h ← h - max(0, ⟨h,v̂⟩ - τ)·v̂   (projection along v̂ is capped at τ)
with ActivationCappingHook(model, vector, "model.layers.16", threshold=0.5):
    output = model.generate(**inputs)

Validation: Hooks fail fast on invalid inputs:
- SteeringHook / AblationHook: Reject non-1D vectors
- AblationHook: Reject zero or near-zero direction vectors
- detect_contribution_paths: Raise ValueError for unrecognized architectures


Generation with Hooks

from core import HookedGenerator, CaptureConfig, SteeringConfig

gen = HookedGenerator(model)

# Batched generation with capture
results = gen.generate(
    input_ids, attention_mask,
    max_new_tokens=50,
    capture=CaptureConfig(layers=[14, 15], components=['residual']),
)
# results[0].token_ids: generated tokens
# results[0].activations[14]['residual']: [n_tokens, hidden_dim]

# Streaming for UI (single sample)
for tok in gen.stream(input_ids, attention_mask, capture=CaptureConfig(layers=[14])):
    print(tok.token_id, tok.activations[14]['residual'].shape)

# Generation with steering
steering = [SteeringConfig(vector=v, layer=14, coefficient=1.5)]
results = gen.generate(input_ids, attention_mask, steering=steering)

Key features:
- KV caching for O(n) generation (not O(n²))
- Clean 1:1 token→activation mapping (no skip-first bug)
- Supports batching, streaming, capture, and steering


Extraction Methods

from core import get_method

method = get_method('probe')  # or 'mean_diff', 'gradient', 'random_baseline', 'rfm'
result = method.extract(pos_acts, neg_acts)
vector = result['vector']

Available methods (all return unit-normalized vectors):
- mean_diff - Baseline: vector = mean(pos) - mean(neg), then normalized
- probe - Logistic regression on row-normalized activations, then normalized
- gradient - Gradient optimization to maximize separation, normalized
- random_baseline - Random unit vector (sanity check, ~50% accuracy)
- rfm - Top eigenvector of AGOP matrix (grid searches bandwidth × center_grads, selects by AUC on val split)

Note: All vectors are unit-normalized for consistent steering coefficients across models. Probe uses row normalization (each sample scaled to unit norm) so LogReg coefficients are ~1 magnitude regardless of model activation scale.

Precision: Activations are stored as float16 (50% space savings). All extraction methods upcast to float32 before computation — gradient descent and epsilon values (1e-8) require it. Probe upcasts via sklearn (internally float64).


Math Functions

from core import projection, batch_cosine_similarity, cosine_similarity, orthogonalize
from core import pairwise_cosine_matrix, pca, project_out_subspace

# Project activations onto vector (normalizes vector only)
scores = projection(activations, trait_vector)  # [n_samples]

# Cosine similarity (normalizes both activations and vector)
scores = batch_cosine_similarity(activations, trait_vector)  # [n_samples] in [-1, 1]

# Compare two vectors
similarity = cosine_similarity(refusal_vec, evil_vec)  # scalar in [-1, 1]

# Remove one vector's component from another
clean_vec = orthogonalize(trait_vector, confound_vector)

# N×N cosine similarity matrix for a set of vectors
sim_matrix = pairwise_cosine_matrix(vectors)  # [N, N]

# PCA via SVD
components, var_ratio, projections = pca(vectors, n_components=10)

# Remove a subspace from vectors (generalizes orthogonalize to multiple directions)
cleaned = project_out_subspace(vectors, basis)  # basis: [K, hidden_dim]

# Cross-trait normalization (grand mean subtraction + neutral PC denoising)
from core.math import grand_mean_center, compute_top_pcs_by_variance, denoise_with_pcs
centered, grand_mean = grand_mean_center(vectors_dict)  # {name: tensor} -> {name: centered}
basis, var_ratio, n_pcs = compute_top_pcs_by_variance(neutral_acts, variance_threshold=0.5)
denoised = denoise_with_pcs(centered, basis)  # project out neutral PCs

# Geometry analysis
from core.math import trait_clusters, representational_similarity, pca_norm_correlation, vector_set_comparison
labels, centroids, inertia = trait_clusters(vectors, k=10)
rsa_matrix, layers = representational_similarity(vectors_by_layer)  # {layer: [N, D]} -> [L, L]
corr = pca_norm_correlation(projections, names, norms)  # PC1/2 vs human valence/arousal
comparison = vector_set_comparison(vecs_a, vecs_b)  # cross-set cosine + orthogonalization

Metrics (operate on projection scores):

from core import accuracy, effect_size, polarity_correct

# First compute projections
pos_proj = batch_cosine_similarity(pos_acts, vector)
neg_proj = batch_cosine_similarity(neg_acts, vector)

# Then compute metrics
acc = accuracy(pos_proj, neg_proj)                    # 0.0 to 1.0
d = effect_size(pos_proj, neg_proj)                   # 0.2=small, 0.5=medium, 0.8=large
d = effect_size(pos_proj, neg_proj, signed=True)      # Preserve sign (pos > neg = positive)
ok = polarity_correct(pos_proj, neg_proj)             # True if pos_mean > neg_mean


Massive Activations

Certain dimensions have values 100-1000x larger than median (Sun et al. 2024). These create fixed biases in projections.

Calibration: Happens passively during the first inference run — hooks capture residual-stream activations on prefill up to ~5000 tokens, write to experiments/{exp}/inference/{model_variant}/massive_activations/calibration.json, self-remove. Subsequent runs skip. See utils/massive_dims.py:MassiveDimCollector.

Advanced — use curated neutral prompts (50 Alpaca-style prompts) instead of whatever inference ran on:

python analysis/vectors/massive_activations.py --experiment gemma-2-2b

Research mode: Analyze a specific prompt set:

python analysis/vectors/massive_activations.py --experiment gemma-2-2b --prompt-set jailbreak_subset --per-token

Visualization: The Trait Dynamics view has a "Clean" dropdown with options:
- "No cleaning" — Raw projections
- "Top 5, 3+ layers" — Dims in top-5 at 3+ layers (recommended)
- "All candidates" — All massive dims


GPU Profiling

from utils.vram import gpu_profile, memory_stats, find_cuda_tensors

# Profile a code block (synchronize-bracketed timing + memory)
with gpu_profile("forward pass"):
    model(**inputs)
# Prints: [forward pass] 0.45s | peak 12.3GB | delta +2.1GB

# Memory snapshot
stats = memory_stats()
# {'allocated': 5.2, 'reserved': 8.0, 'free': 40.0, 'total': 50.8}

# Diagnose leaked tensors after cleanup
leaked = find_cuda_tensors()
# [(torch.Size([64, 300, 2304]), torch.bfloat16, 'cuda:0', 88.47), ...]

Helpers:

from utils.vram import bandwidth_report, tensor_size_gb

bandwidth_report(data_gb=4.6, elapsed=0.19)  # "4.6GB in 0.19s = 24.2 GB/s"
tensor_size_gb((64, 300, 2304))              # 0.089 (for bfloat16)


Generation Backend

Unified interface for generation with steering and activation capture. Abstracts local vs remote inference.

from utils.backends import LocalBackend, get_backend, GenerationConfig, SteeringSpec, CaptureSpec

# From experiment config (auto-selects variant, respects use_chat_template config)
backend = LocalBackend.from_experiment("gemma-2-2b", variant="instruct")

# From already-loaded model
backend = LocalBackend.from_model(model, tokenizer)

# With explicit chat template override (useful for base models)
backend = LocalBackend.from_model(model, tokenizer, use_chat_template=False)

# Auto-select server vs local (prefers server if running)
backend = get_backend(experiment="gemma-2-2b", prefer_server=True)

Chat template resolution (3-level fallback):
1. Explicit use_chat_template parameter
2. Experiment config use_chat_template setting
3. Auto-detect from tokenizer.chat_template is not None

Properties:

backend.n_layers      # Number of transformer layers
backend.hidden_dim    # Hidden dimension size
backend.device        # Model device (torch.device)
backend.model         # Access underlying model (for hooks)
backend.tokenizer     # Access tokenizer (for formatting)

Generation:

# Simple generation
responses = backend.generate(["Hello, how are you?"])

# With configuration
config = GenerationConfig(max_new_tokens=256, temperature=0.7)
responses = backend.generate(prompts, config=config)

# With steering
steering = [SteeringSpec(layer=16, vector=trait_vec, coefficient=1.5)]
responses = backend.generate(prompts, steering=steering)

Generation with capture:

capture = CaptureSpec(layers=[14, 15, 16], components=['residual'])
results = backend.generate_with_capture(prompts, capture=capture)
# results[0].prompt_activations[14]['residual'] -> [n_tokens, hidden_dim]
# results[0].response_activations[14]['residual'] -> [n_tokens, hidden_dim]

Streaming (for chat UIs):

for token in backend.stream(prompt, capture=CaptureSpec(layers=[14])):
    print(token.token, token.activations)

Forward pass with capture (no generation):

activations = backend.forward_with_capture(input_ids, attention_mask, capture)
# activations[layer][component] -> [batch, seq, hidden]

vLLM backend (high-throughput generation, no hooks):

For bulk text generation (stories, dialogues, rollouts) where you don't need activation capture or steering. Uses vLLM's continuous batching for ~5-10x throughput vs HF.

from utils.backends import VLLMBackend

# Initialize (auto-detects AWQ/GPTQ from model ID)
engine = VLLMBackend("hugging-quants/Meta-Llama-3.3-70B-Instruct-AWQ-INT4", seed=42)
responses = engine.generate(formatted_prompts, max_new_tokens=256, temperature=0.7)

# Free GPU before loading HF model for extraction
engine.shutdown()

Constraints:
- No steering hooks (vLLM doesn't expose model internals)
- No activation capture
- Seeds are not cross-backend identical (same seed ≠ same output between HF and vLLM)
- Requires uv pip install vllm (not in default deps)

Escape hatch (for complex hooks):

For operations requiring direct model access (e.g., PerSampleSteering, benchmark logit scoring):

# Use backend.model and backend.tokenizer directly
model = backend.model
tokenizer = backend.tokenizer

# Example: batched steering with different coefficients per batch slice
from core import PerSampleSteering

steering_configs = [(layer, vector, coef, (start, end)) for ...]
with PerSampleSteering(model, steering_configs, component='residual'):
    responses = generate_batch(model, tokenizer, prompts, max_new_tokens=256)

Files

core/
├── __init__.py      # Public API exports
├── types.py         # VectorSpec, VectorResult, JudgeResult, ProjectionConfig, ModelVariant, SteeringEntry, ResponseRecord, ProjectionEntry, ProjectionRecord, SteeringRunRecord, SteeringResults, ActivationMetadata, ModelConfig
├── hooks.py         # CaptureHook, SteeringHook, PerPositionSteeringHook, ProjectionHook, MultiLayerCapture, MultiLayerProjection, ...
├── methods.py       # Extraction methods (probe, mean_diff, gradient)
├── math.py          # projection, cosine_similarity, batch_cosine_similarity, pairwise_cosine_matrix, pca, project_out_subspace, grand_mean_center, compute_top_pcs_by_variance, denoise_with_pcs, trait_clusters, representational_similarity, vector_set_comparison, pca_norm_correlation, accuracy, effect_size, pearson_correlation
└── generation.py    # HookedGenerator for generation with capture/steering