{/* Top activating proteins — 3D structure + activation-shaded sequence
+ per-method confusion (geom from concordance_labels, MEME/Position computed) */}
{/* Geometric profile (radar) + GBM importance */}
{/* Activation bins — every protein renders the full ProteinEntry card
(3D + sequence + MEME / Position confusion). Geom confusion is
unavailable beyond the top 5 (concordance_labels aren't stored
for bin proteins) so the geom row gracefully shows a note. */}
{/* NMPFams metagenomic hits — same structure as Swiss-Prot panel,
but only the Geometric confusion row (no bio annotations exist
for unannotated metagenomic sequences). */}
);
}
// ────────────────────────────────────────────────────────────────────────
// Per-residue method prediction utilities
//
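// The predictors operate on a single protein (its sequence, or just its
// length) and return a length-N array of 0/1 flags (1 = predicted), or null
// when no prediction can be made. confusion() then compares such an array
// against the SAE truth mask supplied by the caller (ProteinEntry builds it
// as act > per-feature SAE threshold).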
// ────────────────────────────────────────────────────────────────────────
// Registry of every predicate name the position-enrichment pipeline emits.
// Each entry is a test (i, n) -> bool: is residue index i (0-based) inside
// that predicate's region of an n-residue protein?
const POSITION_PREDICATES = (() => {
  // Fractional position along the chain: 0 for the first residue, 1 for the
  // last. Math.max keeps the divisor at 1 for single-residue proteins.
  const frac = (i, n) => i / Math.max(n - 1, 1);

  // Half-open fractional window [lo, hi).
  const band = (lo, hi) => (i, n) => {
    const f = frac(i, n);
    return f >= lo && f < hi;
  };

  // Ten equal-width slices of the chain. The last upper bound is nudged past
  // 1 so the final residue (f === 1) still lands in pct_90_100.
  const deciles = Object.fromEntries(
    [
      ["pct_0_10", 0.0, 0.1],
      ["pct_10_20", 0.1, 0.2],
      ["pct_20_30", 0.2, 0.3],
      ["pct_30_40", 0.3, 0.4],
      ["pct_40_50", 0.4, 0.5],
      ["pct_50_60", 0.5, 0.6],
      ["pct_60_70", 0.6, 0.7],
      ["pct_70_80", 0.7, 0.8],
      ["pct_80_90", 0.8, 0.9],
      ["pct_90_100", 0.9, 1.001],
    ].map(([name, lo, hi]) => [name, band(lo, hi)]),
  );

  // N-terminal / middle / C-terminal thirds. The outer thirds are open-ended
  // on their outer side, exactly like the one-sided comparisons they replace.
  const thirds = {
    third_N: (i, n) => frac(i, n) < 1 / 3,
    third_M: band(1 / 3, 2 / 3),
    third_C: (i, n) => frac(i, n) >= 2 / 3,
  };

  // first_ / last_ are absolute residue counts, not percentages.
  const head = (k) => (i, _n) => i < k;
  const tail = (k) => (i, n) => i >= n - k;

  return {
    ...deciles,
    ...thirds,
    first_5: head(5),
    first_10: head(10),
    first_20: head(20),
    last_10: tail(10),
    last_20: tail(20),
    // Named ranges used by the pipeline.
    interior_80pct: band(0.1, 0.9),
    terminal_10pct: (i, n) => {
      const f = frac(i, n);
      return f < 0.1 || f >= 0.9;
    },
    mid_20pct: band(0.4, 0.6),
  };
})();
/**
 * Evaluate a named position predicate at every residue of a protein.
 *
 * @param {string} predicateName - Key into POSITION_PREDICATES.
 * @param {number} length - Number of residues in the protein.
 * @returns {number[]|null} One 0/1 flag per residue (1 = inside the
 *   predicate's region), or null when the name is not a known predicate or
 *   `length` is falsy.
 */
function predictPosition(predicateName, length) {
  // Own-property lookup only: a plain POSITION_PREDICATES[name] would also
  // resolve inherited Object.prototype members ("toString", "constructor", …)
  // and treat them as predicates, yielding a bogus all-ones prediction.
  const isKnown = Object.prototype.hasOwnProperty.call(POSITION_PREDICATES, predicateName);
  const test = isKnown ? POSITION_PREDICATES[predicateName] : undefined;
  if (typeof test !== "function" || !length) return null;
  const out = new Array(length);
  for (let i = 0; i < length; i++) out[i] = test(i, length) ? 1 : 0;
  return out;
}
// Score the PWM at every position (sliding window). Mark every residue inside
// a window whose log-likelihood ratio >= threshold as predicted=1.
//
// `pwm` rows are *probabilities* in `aa_order` (rows sum to 1, exactly as the
// pipeline writes them in motif_pwm_enrichment.motifs[].pwm). The background
// is uniform over the alphabet in `aaOrder` (A = aaOrder.length letters, i.e.
// 20 for the standard amino acids); the score is the standard MEME log-odds:
//
// score = sum_k log( pwm[k][a] / (1/A) ) = sum_k ( log(pwm[k][a]) + log(A) )
//
// Zero-probability cells get a small pseudocount so log doesn't blow up.
//
// Parameters:
//   seq       - residue string to scan
//   pwm       - motif matrix, one row per motif column, indexed by aaOrder
//   aaOrder   - alphabet (string or array of letters) giving the row layout
//   threshold - minimum window score that counts as a hit
// Returns a length-seq.length array of 0/1 flags, or null when any input
// needed for scoring (seq, pwm, aaOrder, threshold) is missing or empty.
// A window containing a letter that is not in aaOrder is never a hit.
function predictMEME(seq, pwm, aaOrder, threshold) {
  if (!seq || !pwm || !pwm.length || threshold == null) return null;
  // Without an alphabet no PWM column can be indexed. Report "no prediction"
  // rather than throwing on aaOrder.length when a motif record lacks it.
  if (!aaOrder || !aaOrder.length) return null;
  const w = pwm.length;
  const n = seq.length;
  const aaIdx = {};
  for (let i = 0; i < aaOrder.length; i++) aaIdx[aaOrder[i]] = i;
  const logBg = Math.log(1 / aaOrder.length);
  const eps = 1e-6; // pseudocount for zero-probability cells
  const out = new Array(n).fill(0);
  for (let p = 0; p + w <= n; p++) {
    let s = 0;
    let valid = true;
    for (let k = 0; k < w; k++) {
      const ix = aaIdx[seq[p + k]];
      // Unknown residue (e.g. X): the whole window is unscorable.
      if (ix == null) { valid = false; break; }
      const prob = pwm[k][ix];
      s += Math.log(prob + eps) - logBg;
    }
    if (valid && s >= threshold) {
      // Windows overlap, so a residue is flagged if ANY covering window hits.
      for (let k = 0; k < w; k++) out[p + k] = 1;
    }
  }
  return out;
}
/**
 * Per-residue confusion between a truth mask and a prediction mask.
 *
 * Entries are read by truthiness, so 0/1 numbers and booleans both work.
 *
 * @param {ArrayLike<*>} truth - Ground-truth flags, one per residue.
 * @param {ArrayLike<*>} pred - Predicted flags, same length as `truth`.
 * @returns {{labels: string[], tp: number, fp: number, fn: number, tn: number,
 *   agree_pct: number}|null} Per-residue labels ("tp" | "fp" | "fn" | "tn"),
 *   the four counts, and the percentage of residues where both masks agree;
 *   null when either mask is missing or empty, or the lengths differ.
 */
function confusion(truth, pred) {
  const comparable = truth && pred && truth.length && truth.length === pred.length;
  if (!comparable) return null;

  const counts = { tp: 0, fp: 0, fn: 0, tn: 0 };
  const labels = Array.from({ length: truth.length }, (_, i) => {
    const isTrue = Boolean(truth[i]);
    const isPred = Boolean(pred[i]);
    // Predicted residues are tp/fp, unpredicted ones fn/tn.
    const label = isPred ? (isTrue ? "tp" : "fp") : (isTrue ? "fn" : "tn");
    counts[label] += 1;
    return label;
  });

  const { tp, fp, fn, tn } = counts;
  const total = tp + fp + fn + tn;
  const agree_pct = total ? 100 * (tp + tn) / total : 0;
  return { labels, tp, fp, fn, tn, agree_pct };
}
// Pretty-print agree pct so 199/200 reads as "99.5", not "100". Anything with
// any disagreement stays strictly below 100 so the value never lies; perfect
// agreement reports as "100".
//
// `conf` is a confusion record carrying tp / fp / fn / tn counts. Returns "—"
// when `conf` is missing or all four counts are zero, otherwise the agreement
// percentage as a string with one decimal place (or exactly "100").
function fmtAgreePct(conf) {
  if (!conf) return "—";
  const agree = conf.tp + conf.tn;
  const disagree = conf.fp + conf.fn;
  const total = agree + disagree;
  if (!total) return "—";
  if (disagree === 0) return "100";
  // Round first, then clamp. Capping the raw value at 99.95 does not work:
  // the double nearest 99.95 lies just above it, so toFixed(1) rounds it up
  // and prints "100.0" (e.g. for 9999/10000) despite real disagreement.
  const rounded = ((100 * agree) / total).toFixed(1);
  return Number(rounded) >= 100 ? "99.9" : rounded;
}
// Per-residue colour map for the 3D structure: pure white → red activation
// intensity. The same ramp mol_viewer.js uses for Swiss-Prot — applied
// consistently to NMPFams cartoons too, so the backbone never gets class
// colours. Confusion classes belong on the strip below, not on the structure.
//
// `acts` is the per-residue activation array; `maxAct` is the value that maps
// to full red. Returns an object keyed by 1-based residue number whose values
// are packed 0xRRGGBB integers. Missing `acts` gives an empty map; a
// non-positive `maxAct` paints every residue white.
function buildActivationColorMap(acts, maxAct) {
  const map = {};
  if (!acts) return map;
  const RED_CHANNEL = 255;
  const WHITE = 255; // G/B value at zero activation
  const FULL = 38; // G/B value at full activation
  for (let i = 0; i < acts.length; i++) {
    const ratio = maxAct > 0 ? (acts[i] || 0) / maxAct : 0;
    // Clamp to [0, 1]. Without the lower bound a negative activation pushes
    // G/B above 255, which spills into neighbouring bits of the packed colour.
    const norm = Math.min(Math.max(ratio, 0), 1);
    const gb = Math.round(WHITE - norm * (WHITE - FULL));
    // Keys are 1-based residue numbers, not array indices.
    map[i + 1] = (RED_CHANNEL << 16) | (gb << 8) | gb;
  }
  return map;
}
// Initialise a 3Dmol viewer for the given PDB URL, colouring residues by an
// arbitrary residue-number → hex map (e.g. TP/FP/FN/TN palette). Container
// must have non-zero size at call time.
async function init3DViewerWithMap(container, pdbUrl, colorMap) {
if (!container || !window.$3Dmol) return;
try {
const r = await fetch(pdbUrl);
if (!r.ok) {
container.innerHTML = '
No per-protein geometry plot available for this feature.
);
}
return (
Top activating proteins · structure · per-method residue agreement
{tps.length} protein{tps.length === 1 ? "" : "s"} ·
activation in orange ·
per-method confusion strips (TP / FP / FN / TN) · hover a residue to align across rows
{tps.map((p, i) => (
))}
);
}
// ProteinEntry — card for a single protein: lazily-initialised 3D viewer plus
// per-residue confusion (Geometric / MEME motif / InterPro residue / Position)
// against the SAE truth mask.
//
// Props:
//   protein — record read here for `accession`, `sequence`,
//             `sae_activation_profile` | `per_residue_activations`, and
//             `concordance_labels`.
//   ctx     — shared per-feature context: feat, featureMaxAct, saeThreshold,
//             motif ({ pwm, aaOrder, threshold, consensus }), posPredicate, posF1.
//
// NOTE(review): the JSX markup of the return value is not visible in this
// view, so only the hooks and the `methods` descriptor list are documented.
function ProteinEntry({ protein, ctx }) {
const { feat, featureMaxAct, saeThreshold, motif, posPredicate, posF1 } = ctx;
const containerRef = React.useRef(null);
const [hover, setHover] = React.useState(null);
const seq = protein.sequence || "";
// Two possible schemas for the activation source:
// - geometry_enrichment.plot_data.top_proteins: `sae_activation_profile`
// - features.{fid}.activation_bins[*]: `per_residue_activations`
// NOTE(review): the `[]` fallback is a fresh array on every render, so memos
// keyed on `acts` recompute each render when neither field is present.
const acts = protein.sae_activation_profile || protein.per_residue_activations || [];
// NOTE(review): spreading `acts` passes one argument per residue to Math.max;
// presumably fine for protein-length arrays, but engines cap the argument
// count — confirm the longest sequence served.
const maxAct = Math.max(...acts, featureMaxAct || 0.001);
// Truth mask matches the pipeline's truth: residues whose raw SAE activation
// exceeds the per-feature SAE threshold (geometric_residue_level.activation_threshold).
// This is what concordance_labels in geometry_enrichment is computed against,
// so the MEME / Position rows here use the same truth as the geom row above.
const truthMask = React.useMemo(
() => acts.map(a => (a > saeThreshold ? 1 : 0)),
[acts, saeThreshold]
);
// Geom confusion — only available when the pipeline pre-computed concordance
// labels (i.e. for the top 5 proteins in plot_data.top_proteins). Activation
// bin / arbitrary proteins won't have it — null falls through to the row's
// graceful "concordance not stored" message.
const geomConf = React.useMemo(
() => protein.concordance_labels && protein.concordance_labels.length
? confusionFromLabels(protein.concordance_labels)
: null,
[protein.concordance_labels]
);
// MEME confusion — derive from PWM scan.
const memeConf = React.useMemo(() => {
const pred = motif.pwm
? predictMEME(seq, motif.pwm, motif.aaOrder, motif.threshold)
: null;
return pred ? confusion(truthMask, pred) : null;
}, [seq, motif, truthMask]);
// Position confusion.
const posConf = React.useMemo(() => {
const pred = posPredicate ? predictPosition(posPredicate, seq.length) : null;
return pred ? confusion(truthMask, pred) : null;
}, [posPredicate, seq, truthMask]);
// InterPro residue confusion. Lazy-fetch the cached InterPro domain spans
// for this protein (only present for layers that have an interpro_cache).
// The "prediction" is residues inside any domain whose interpro_accession
// matches the first token of the feature's m2_label. When the feature has an
// m2_label but none of this protein's domains match it, there is NO
// prediction (iprConf is null); every domain is used only when the feature
// has no m2_label at all.
// `iprDomains` is null while loading and [] on a failed or empty response.
const [iprDomains, setIprDomains] = React.useState(null);
React.useEffect(() => {
// `cancelled` stops a late response from setting state after cleanup.
let cancelled = false;
if (!protein.accession) return;
fetch(`/api/interpro/${protein.accession}`)
.then(r => r.ok ? r.json() : null)
.then(d => { if (!cancelled) setIprDomains(d?.domains || []); })
.catch(() => { if (!cancelled) setIprDomains([]); });
return () => { cancelled = true; };
}, [protein.accession]);
const iprConf = React.useMemo(() => {
if (iprDomains == null) return null; // still loading
if (!iprDomains.length) return null; // genuinely no annotations
const targetCode = (feat?.m2_label || "").split(" ")[0]; // e.g. "IPR036291 NAD..."
const matches = targetCode
? iprDomains.filter(d => d.interpro_accession === targetCode)
: iprDomains;
if (!matches.length) return null;
const pred = new Array(seq.length).fill(0);
for (const d of matches) {
// Domain start/end are treated as 1-based inclusive: convert to the
// 0-based half-open range [start, end) and clip to the sequence.
const start = Math.max(0, (d.start || 1) - 1);
const end = Math.min(seq.length, (d.end || seq.length));
for (let i = start; i < end; i++) pred[i] = 1;
}
return confusion(truthMask, pred);
}, [iprDomains, feat?.m2_label, seq.length, truthMask]);
// 3D viewer — lazy-init via IntersectionObserver (or immediately when the
// browser has no IntersectionObserver). The colour map comes straight from
// buildActivationColorMap, i.e. activation intensity only; confusion classes
// are not painted onto the structure. Normalisation uses featureMaxAct when
// it is positive, otherwise this protein's own maxAct.
// NOTE(review): the effect reads `acts`, `featureMaxAct` and `maxAct` but
// only depends on protein.accession — presumably intentional so the viewer
// is built once per protein; confirm.
React.useEffect(() => {
if (!protein.accession || !containerRef.current) return;
let observer;
let initialized = false;
const init = () => {
if (initialized || !containerRef.current) return;
initialized = true;
const norm = featureMaxAct && featureMaxAct > 0 ? featureMaxAct : maxAct;
const colorMap = buildActivationColorMap(acts, norm);
init3DViewerWithMap(containerRef.current,
`/api/pdb/${protein.accession}`, colorMap);
};
if ("IntersectionObserver" in window) {
observer = new IntersectionObserver((entries) => {
for (const e of entries) {
if (e.isIntersecting) { init(); observer.disconnect(); break; }
}
}, { rootMargin: "120px" });
observer.observe(containerRef.current);
} else {
init();
}
return () => { if (observer) observer.disconnect(); };
}, [protein.accession]);
// Count of residues above the SAE truth threshold. The previous "active
// window N–M" reading was misleading because most positions in that range
// were quiet — what matters is the fraction of residues that fired.
const nActive = React.useMemo(
() => acts.reduce((s, a) => s + (a > saeThreshold ? 1 : 0), 0),
[acts, saeThreshold]
);
// Build a single shared sequence row + the four method rows so they all
// align at residue level. We render them in a horizontally-scrolling
// container so long proteins remain navigable.
return (
{protein.accession ? "3D viewer loads on scroll" : "no structure"}
3Dmol cartoon · /api/pdb/{protein.accession}
act 0
½ max
max act
{/* RIGHT — sequence with activation shading + per-method strips,
sharing a single horizontal scroll so AA position N always
aligns with column N in every confusion strip. */}
Max act{fmt(maxAct, 2)}
>}
methods={[
{
short: "Geometric", kind: "geom",
conf: geomConf, q: feat?.m7_q,
detail: geomConf
? `q ${fmtQ(feat?.m7_q)}`
: `concordance only pre-computed for top 5 proteins · q ${fmtQ(feat?.m7_q)}`,
},
{
short: "MEME motif", kind: "bio",
conf: memeConf, q: feat?.m6_q,
detail: motif.consensus
? `consensus ${motif.consensus} · thr ${fmt(motif.threshold, 2)}`
: "no PWM available",
},
{
short: "InterPro residue", kind: "bio",
conf: iprConf,
q: feat?.m2_q,
detail: iprDomains == null
? `${feat?.m2_label || "—"} · loading interpro cache…`
: !iprDomains.length
? `${feat?.m2_label || "—"} · no interpro annotations for this protein`
: !iprConf
? `${feat?.m2_label || "—"} · domain not in this protein's annotations`
: `${feat?.m2_label || "—"} · domains from interpro_cache`,
},
{
short: "Position", kind: "bio",
conf: posConf, q: feat?.m5_q,
detail: posPredicate
? `predicate "${posPredicate}" · global F1 ${fmt(posF1, 2)}`
: "no top predicate",
},
]}
/>
);
}
// ────────────────────────────────────────────────────────────────────────
// AlignedSequenceStack — single horizontally-scrolling block that renders
// the AA sequence and N method-confusion strips on a shared residue grid
// (every row uses the same fixed cell width). Labels stick to the left
// and stats stick to the right so the user sees what they're looking at
// even when scrolled deep into a long protein.
// ────────────────────────────────────────────────────────────────────────
const CELL_W = 14; // px per residue, shared across all rows
const NAME_W = 124; // sticky left column width
const STATS_W = 168; // sticky right column width
// Props read in the visible code:
//   seq          — residue string; one cell is produced per character
//   acts         — per-residue activations (missing entries count as 0)
//   maxAct       — activation that maps to the strongest shading
//   saeThreshold — a residue is "hot" (truth=1) only when act > saeThreshold
//   onHover      — optional callback, called with the residue index on a cell
//                  and with null from the container handler
//   methods      — one method row is produced per entry
//   rightStub    — node rendered after the residue cells of the AA row
// NOTE(review): the JSX markup is not visible in this view; `hover`, NAME_W
// and STATS_W are presumably consumed by the stripped markup.
function AlignedSequenceStack({ seq, acts, maxAct, saeThreshold, hover, onHover, methods, rightStub }) {
// Total pixel width of the residue track (residue count × CELL_W).
// NOTE(review): `seq?.length` tolerates a missing seq here, but the
// `seq.length` / `seq.split` uses below would still throw — confirm callers
// always pass a string.
const trackWidth = (seq?.length || 0) * CELL_W;
return (
Sequence + per-method residue agreement
{seq.length} aa · {CELL_W}px per residue · hover or scroll horizontally to align rows · truth = act > SAE threshold {fmt(saeThreshold, 2)}
onHover && onHover(null)}>
{/* AA row */}
SAE activation
{seq.split("").map((aa, i) => {
const a = acts[i] || 0;
const isHot = a > saeThreshold;
// Only orange-shade residues that are above the pipeline's truth
// threshold. Below-threshold residues are TN and shouldn't read
// as "activated" — that's what was making the user think a sub-
// threshold position should be FN.
const bg = isHot && maxAct > 0
? `color-mix(in oklch, var(--geom) ${Math.round(Math.min(a / maxAct, 1) * 80)}%, var(--paper))`
: "transparent";
return (
onHover && onHover(i)}
title={`pos ${i + 1} · ${aa} · act ${a.toFixed(3)}${isHot ? " (truth=1)" : " (truth=0)"}`}>
{aa}
);
})}
{rightStub}
{/* Method rows */}
{methods.map((m, i) => (
))}
);
}
// One method row of the aligned stack. Takes a method descriptor `m` plus the
// shared hover state and track width.
// NOTE(review): the JSX markup is not visible in this view, so how `hover`,
// `onHover`, `trackWidth` and `sigBool` are used cannot be confirmed here.
function AlignedMethodRow({ m, hover, onHover, trackWidth }) {
// Significance flag derived from the method's q-value via isSig.
const sigBool = isSig(m.q);
return (
);
}
// Per-protein method row. Stat column shows ONLY per-protein numbers
// (agreement %, residue counts, q-value). The feature-level F1/PR-AUC
// already lives in the methods grid at the top of the feature page,
// so we don't repeat it here.
function MethodRow({ m, conf, q, detail, hover, onHover }) {
const sigBool = isSig(q);
const accent = m.kind === "geom" ? "var(--geom)" : "var(--bio)";
return (
6-dim summary · normalised importance per category
{radar ? (
{Object.entries(radar).map(([k, v]) => (
{k}{(v ?? 0).toFixed(2)}
))}
) : (
No geometric profile available for this feature.
)}
GBM feature importance
Cα descriptor importances from the geometric classifier
{geomEnrich.loading
?
: importances
?
:
No importance vector available.
}
);
}
// Renders the largest GBM feature importances as a ranked list.
// NOTE(review): the JSX markup is not visible in this view; only the data
// preparation below is documented.
function GbmImportanceVector({ importances }) {
// importances is a dict of {feature_name: weight}. Sort descending and render
// the top-K as a horizontal bar list with a small label-totals strip on top.
// Weights are coerced with Number(); non-numeric values become 0 and are then
// dropped together with every other non-positive weight.
const entries = Object.entries(importances)
.map(([k, v]) => [k, Number(v) || 0])
.filter(([, v]) => v > 0)
.sort((a, b) => b[1] - a[1]);
if (!entries.length) return
No non-zero importances.
;
// Largest weight (entries are sorted descending).
const max = entries[0][1] || 1;
// At most 16 entries are listed; the remainder is summarised as a count.
const SHOW = Math.min(entries.length, 16);
return (
{entries.slice(0, SHOW).map(([k, v], i) => (
{k}{(v * 100).toFixed(1)}%
))}
{entries.length > SHOW && (
+ {entries.length - SHOW} more descriptors below threshold
)}
);
}
// ────────────────────────────────────────────────────────────────────────
// Activation bins — click each to expand the proteins that activate in that bin.
// Real schema: activation_bins is a dict keyed by "0.0-0.25", ..., values are
// arrays of {accession, max_activation, sequence, per_residue_activations, ...}.
// ────────────────────────────────────────────────────────────────────────
// Lists the feature's activation bins, highest activation range first.
// Returns null when `det` has no bin whose value is an array.
// NOTE(review): the JSX markup is not visible in this view; `ctx`, `open` and
// `setOpen` are presumably consumed by the stripped markup.
function ActivationBins({ det, ctx }) {
const bins = (det && det.activation_bins) || {};
// Fixed display order, from the strongest bin to the weakest.
const order = ["0.75-1.0", "0.5-0.75", "0.25-0.5", "0.0-0.25"];
const [open, setOpen] = React.useState(null);
// Keep only bins present as arrays; the label swaps the key's hyphen for a
// spaced en dash.
const items = order
.filter(k => Array.isArray(bins[k]))
.map(k => ({ key: k, label: k.replace("-", " – "), proteins: bins[k] }));
if (!items.length) return null;
return (
Activation bins
activating proteins grouped by max-activation range — click to expand
);
}
// ────────────────────────────────────────────────────────────────────────
// MetagenomicHits — NMPFams metagenomic protein clusters that activate
// this feature. Same per-residue confusion structure as Swiss-Prot, but
// only the Geometric row is meaningful (these sequences have no biological
// annotations by construction). Real data, no synthesis:
// - sequence, sae_activation_profile, geom_prob_profile, concordance_labels
// are pre-computed and stored in nmpfam_enrichment[fid].nmpfam_hits[i].
// ────────────────────────────────────────────────────────────────────────
// Panel of NMPFams hits for one feature. Renders nothing while the request is
// loading and nothing when the response carries no hits.
// NOTE(review): the JSX markup is not visible in this view; `feat`, `open`,
// `tier` and their setters are presumably consumed by the stripped markup.
function MetagenomicHits({ layer, featureId, feat }) {
// A rejected request is mapped to null so a failed fetch renders as "no hits".
const nmp = useFetch(() => API.nmpfam(layer, featureId).catch(() => null), [layer, featureId]);
const [open, setOpen] = React.useState(false);
const [tier, setTier] = React.useState("triple");
if (nmp.loading) return null;
// Tolerate 404s — many features simply have no NMPFam hits.
if (!nmp.data || !nmp.data.nmpfam_hits || !nmp.data.nmpfam_hits.length) return null;
const allHits = nmp.data.nmpfam_hits;
// Sort by concordance (n_agree) descending — the most striking metagenomic
// hits are the ones where geom prediction lines up with SAE activation.
// Sorting a copy leaves the fetched array untouched; a missing n_agree
// counts as 0.
const sorted = [...allHits].sort((a, b) => (b.n_agree || 0) - (a.n_agree || 0));
// Top 8 strong-agreement hits in the default expansion
const SHOW = 8;
return (
Showing top {Math.min(SHOW, sorted.length)} hits by n_agree (residues where SAE truth and geom prediction overlap).
Each hit's structure is fetched live from the Fleming Institute NMPFams DB.
bio annotation methods don't apply to unannotated metagenomic sequences
);
}
// Helpers needed by case-study.jsx (the side-by-side detail viewer reuses the
// same activation colouring + Plotly helper). Babel-standalone compiles each
//