Geometry annotates features missing DB labels

Features for which all four database / sequence methods (InterPro residue + protein, CATH residue, MEME motif, sequence position) are non-significant, but for which the geometric Cα classifier reaches BH q < 0.05. This is the population the paper calls "geometry-primary".

Geometric annotation transfers to metagenomic proteins

For every SAE feature that fires on NMPFams metagenomic clusters, we run the pre-trained Swiss-Prot geometric classifier on each metagenomic protein and report PR-AUC. A median per-family PR-AUC > 0.5 indicates that the feature's geometric annotation generalises beyond the training distribution.

ID	Hits	Strong	Max PR-AUC	Median PR-AUC	Sequences annotated	Top family · max PR-AUC
f/{r.feature_id}	{r.n_hits.toLocaleString()}	{r.n_strong.toLocaleString()}	{fmt(r.max_prauc, 3)}	{fmt(r.median_prauc, 3)}	{(r.sequences_annotated || 0).toLocaleString()}	{top ? `${top.family_id} · ${fmt(top.prauc, 3)} · ${(top.sequence_count || 0).toLocaleString()} seq` : '—'}

Hits

Strong

Max PR-AUC

Median PR-AUC

Sequences annotated

Top family · max PR-AUC

f/{r.feature_id}

{r.n_hits.toLocaleString()}

{r.n_strong.toLocaleString()}

{fmt(r.max_prauc, 3)}

{fmt(r.median_prauc, 3)}

{(r.sequences_annotated || 0).toLocaleString()}

{top ? `${top.family_id} · ${fmt(top.prauc, 3)} · ${(top.sequence_count || 0).toLocaleString()} seq` : '—'}

{group.annotation_name || group.name}

{features.length} member features {cosine != null ? ` · mean pairwise cosine ${fmt(cosine, 3)}` : ''} {cosine != null ? (cosine < 0.5 ? GEOM split (cos < 0.5) : not split by geom) : null} {group.mean_geom_pr_auc != null ? ` · mean geom PR-AUC ${fmt(group.mean_geom_pr_auc, 3)}` : ''} {group.mean_residue_f1 != null ? ` · mean residue F1 ${fmt(group.mean_residue_f1, 3)}` : ''}

; } // 44-dim importance heatmap. Pulls feature_importances per member from the // loaded geometry_enrichment payloads. function CS2ImportanceHeatmap({ features, geomDetails }) { const ref = React.useRef(null); React.useEffect(() => { if (!ref.current || !window.Plotly) return; if (!geomDetails.data) { ref.current.innerHTML = '

Loading per-feature importance vectors…

'; return; } const z = features.map((m, i) => { const det = geomDetails.data[i]; const importances = det?.geometric_residue_level?.feature_importances || {}; return CS2_DESCRIPTORS.map(d => Number(importances[d] || 0)); }); const data = [{ z, x: CS2_DESCRIPTORS, y: features.map(m => `f/${m.feature_id}`), type: "heatmap", colorscale: "YlOrBr", reversescale: true, zmin: 0, colorbar: { title: "importance", thickness: 12 }, hovertemplate: "%{y}
%{x}
%{z:.3f}", }]; const layout = { height: Math.max(360, features.length * 16 + 220), margin: { l: 80, r: 30, t: 10, b: 180 }, xaxis: { side: "bottom", tickangle: -55, tickfont: { size: 8, family: "IBM Plex Mono" }, }, yaxis: { autorange: "reversed", tickfont: { size: 9, family: "IBM Plex Mono" } }, font: { family: "IBM Plex Sans" }, paper_bgcolor: "rgba(0,0,0,0)", plot_bgcolor: "rgba(0,0,0,0)", }; Plotly.react(ref.current, data, layout, { displayModeBar: false, responsive: true }); }, [features, geomDetails.data]); return

; } // Shared-protein overlay: pick a UniProt accession that ≥2 members hit // in their top_proteins; for each covering feature plot its SAE activation // (solid) and geometric probability (dashed) along that protein's residues. function CS2SharedProteinOverlay({ features, geomDetails }) { const sharedByAccession = React.useMemo(() => { if (!geomDetails.data) return []; const map = {}; // accession -> [{feature_id, sequence, sae_act, geom_prob}] geomDetails.data.forEach((det, i) => { const fid = features[i].feature_id; const tps = det?.plot_data?.top_proteins || []; for (const p of tps) { if (!p.accession) continue; if (!map[p.accession]) map[p.accession] = []; map[p.accession].push({ feature_id: fid, sequence: p.sequence, sae_activation_profile: p.sae_activation_profile, geom_prob_profile: p.geom_prob_profile, }); } }); return Object.entries(map) .filter(([, entries]) => entries.length >= 2) .sort((a, b) => b[1].length - a[1].length || a[0].localeCompare(b[0])); }, [features, geomDetails.data]); const [picked, setPicked] = React.useState(null); React.useEffect(() => { if (sharedByAccession.length && !picked) setPicked(sharedByAccession[0][0]); }, [sharedByAccession, picked]); const ref = React.useRef(null); React.useEffect(() => { if (!ref.current || !window.Plotly || !picked) return; const entries = (sharedByAccession.find(([a]) => a === picked) || [, []])[1]; if (!entries.length) return; const length = Math.max(...entries.map(e => (e.sae_activation_profile || []).length)); const x = Array.from({ length }, (_, i) => i + 1); const palette = ["#C97A00", "#5C82BD", "#7C7CC4", "#E07333", "#3B6E9B", "#9B6FB7", "#D04848"]; const traces = []; entries.forEach((e, i) => { const c = palette[i % palette.length]; traces.push({ x, y: e.sae_activation_profile, name: `f/${e.feature_id} · SAE`, type: "scatter", mode: "lines", line: { color: c, width: 1.5 }, yaxis: "y", }); traces.push({ x, y: e.geom_prob_profile, name: `f/${e.feature_id} · Geom`, type: "scatter", mode: "lines", 
line: { color: c, width: 1.5, dash: "dash" }, yaxis: "y2", }); }); const layout = { height: 420, margin: { l: 60, r: 60, t: 20, b: 40 }, xaxis: { title: "Residue", tickfont: { family: "IBM Plex Mono", size: 10 } }, yaxis: { title: "SAE activation", side: "left", tickfont: { family: "IBM Plex Mono", size: 10 } }, yaxis2: { title: "Geom probability", side: "right", overlaying: "y", range: [0, 1], tickfont: { family: "IBM Plex Mono", size: 10 }, }, legend: { orientation: "h", x: 0, y: -0.18, font: { size: 10, family: "IBM Plex Mono" } }, font: { family: "IBM Plex Sans" }, paper_bgcolor: "rgba(0,0,0,0)", plot_bgcolor: "rgba(0,0,0,0)", }; Plotly.react(ref.current, traces, layout, { displayModeBar: false, responsive: true }); }, [picked, sharedByAccession]); if (geomDetails.loading) return

; if (!sharedByAccession.length) { return (

No protein appears in the top-activating sample of ≥ 2 features in this family.

); } return (

Shared protein

); } // ──────────────────────────────────────────────────────────────────────── // Case study 03 — per-feature side-by-side SwissProt vs NMPFam detail // (the visual idiom in the paper's Figure 4: a Swiss-Prot reference // protein on the left, a metagenomic NMPFam family on the right, each // with structure on top and per-residue activation + geom probability // on the bottom). Picks the strongest hit by default and lets the user // scrub to other strong hits via dropdowns. // ──────────────────────────────────────────────────────────────────────── function CaseStudyMetagenomicDetail({ layer, featureId, onBack, onOpenFeaturePage }) { const transfer = useFetch(() => API.nmpfamTransferSummary(layer).catch(() => null), [layer]); const geom = useFetch(() => API.geometry(layer, featureId).catch(() => null), [layer, featureId]); const nmp = useFetch(() => API.nmpfam(layer, featureId).catch(() => null), [layer, featureId]); const sig = useFetch(() => API.significance(layer, featureId).catch(() => null), [layer, featureId]); if (transfer.loading || geom.loading || nmp.loading || sig.loading) { return

; } const featAgg = (transfer.data?.features || []).find(f => f.feature_id === featureId); const swissProteins = geom.data?.plot_data?.top_proteins || []; const nmpHits = nmp.data?.nmpfam_hits || []; // Rank hits by max geometric probability (max_geom_prob), highest first; // hits without a max_geom_prob value are treated as 0. const ranked = [...nmpHits].sort((a, b) => (b.max_geom_prob || 0) - (a.max_geom_prob || 0)); return (

← back to feature list

Case study · 03 · Layer {layer} · §4.3 f/{featureId}

Geometric annotation transfer · f/{featureId}

Side-by-side: a Swiss-Prot reference protein the SAE feature was originally trained on, and an NMPFam metagenomic family it transfers to. Each panel shows the predicted structure (coloured by activation intensity) and the per-residue SAE activation against the Swiss-Prot-trained GBM's geometric probability.

{/* Per-feature transfer aggregates */} {featAgg && (

Max PR-AUC

{fmt(featAgg.max_prauc, 3)}

best metagenomic transfer

Median PR-AUC

{fmt(featAgg.median_prauc, 3)}

across {featAgg.n_hits.toLocaleString()} NMPFam hits ({featAgg.n_strong} strong)

Sequences annotated

{(featAgg.sequences_annotated || 0).toLocaleString()}

across families with PR-AUC > 0.5

)}

onOpenFeaturePage(featureId)} style={{ color: 'var(--ink)', cursor: 'pointer' }}> → open the full feature page (top SwissProt proteins + activation bins + all NMPFam hits)

{/* Side-by-side comparison */}

); } function CS3SideBySide({ swissProteins, nmpHits, featureMaxAct, nmpThreshold, feat }) { const [swissIdx, setSwissIdx] = React.useState(0); const [nmpIdx, setNmpIdx] = React.useState(0); const swiss = swissProteins[swissIdx] || null; const hit = nmpHits[nmpIdx] || null; if (!swiss && !hit) { return (

No SwissProt top proteins or NMPFam hits available for this feature.

); } // The SwissProt threshold lives on the geometry payload (geometric_residue_level // .activation_threshold) but we only need it to draw the truth band; the Swiss // panel here just shows act vs geom_prob from the pre-computed plot_data, so // we don't need to recompute confusion. return (

); } function CS3Panel({ kind, title, proteins, idx, setIdx, protein, featureMaxAct, feat, nmpThreshold }) { const containerRef = React.useRef(null); const plotRef = React.useRef(null); // Lazy 3D viewer (white→orange activation intensity, same scheme used elsewhere). React.useEffect(() => { if (!protein || !containerRef.current) return; const acc = kind === "swiss" ? protein.accession : protein.family_id; if (!acc) return; let observer, initialized = false; const init = () => { if (initialized || !containerRef.current) return; initialized = true; const acts = (kind === "swiss" ? protein.sae_activation_profile : protein.sae_activation_profile) || []; const norm = featureMaxAct && featureMaxAct > 0 ? featureMaxAct : Math.max(...acts, 0.001); const colorMap = buildActivationColorMap(acts, norm); const url = kind === "swiss" ? `/api/pdb/${acc}` : `/api/nmpfam-pdb/${acc}`; init3DViewerWithMap(containerRef.current, url, colorMap); }; if ("IntersectionObserver" in window) { observer = new IntersectionObserver((entries) => { for (const e of entries) if (e.isIntersecting) { init(); observer.disconnect(); break; } }, { rootMargin: "120px" }); observer.observe(containerRef.current); } else { init(); } return () => { if (observer) observer.disconnect(); }; }, [protein, kind, featureMaxAct]); // Per-residue dual-axis line plot: SAE activation (orange, left axis) + // geom probability (blue, right axis). Vertical band on residues above // the SAE truth threshold so the reader sees where geometry should fire. 
React.useEffect(() => { if (!plotRef.current || !window.Plotly) return; if (!protein) { plotRef.current.innerHTML = ''; return; } const acts = protein.sae_activation_profile || []; const probs = protein.geom_prob_profile || []; const x = Array.from({ length: acts.length }, (_, i) => i + 1); const traces = [ { x, y: acts, name: 'SAE activation', type: 'scatter', mode: 'lines', line: { color: '#C97A00', width: 1.6 }, yaxis: 'y', }, { x, y: probs, name: 'Geom probability', type: 'scatter', mode: 'lines', line: { color: '#5C82BD', width: 1.6, dash: 'dash' }, yaxis: 'y2', }, ]; // Truth band: residues above the SAE threshold get a faint orange shade. let threshold = 0; if (kind === 'swiss') { // SwissProt threshold isn't in the geometry payload's plot_data, // approximate with feature-level max × 0.5 (consistent with other places // we don't have the exact value). threshold = (featureMaxAct || 0) * 0.5; } else { threshold = nmpThreshold ?? 0; } const shapes = []; if (threshold > 0) { let bandStart = null; for (let i = 0; i < acts.length; i++) { const above = acts[i] > threshold; if (above && bandStart === null) bandStart = i; if ((!above || i === acts.length - 1) && bandStart !== null) { const bandEnd = above ? 
i : i - 1; shapes.push({ type: 'rect', xref: 'x', yref: 'paper', x0: bandStart + 1, x1: bandEnd + 1, y0: 0, y1: 1, fillcolor: '#C97A00', opacity: 0.10, line: { width: 0 }, }); bandStart = null; } } } const layout = { height: 220, margin: { l: 50, r: 50, t: 10, b: 36 }, xaxis: { title: 'Residue', tickfont: { family: 'IBM Plex Mono', size: 10 } }, yaxis: { title: 'SAE act', side: 'left', tickfont: { family: 'IBM Plex Mono', size: 10 } }, yaxis2: { title: 'Geom prob', side: 'right', overlaying: 'y', range: [0, 1], tickfont: { family: 'IBM Plex Mono', size: 10 } }, legend: { orientation: 'h', x: 0, y: -0.32, font: { size: 10, family: 'IBM Plex Mono' } }, font: { family: 'IBM Plex Sans' }, shapes, paper_bgcolor: 'rgba(0,0,0,0)', plot_bgcolor: 'rgba(0,0,0,0)', }; Plotly.react(plotRef.current, traces, layout, { displayModeBar: false, responsive: true }); }, [protein, kind, featureMaxAct, nmpThreshold]); if (!protein) { return (

{title}

No {kind === 'swiss' ? 'SwissProt' : 'NMPFam'} protein available.

); } const acc = kind === 'swiss' ? protein.accession : protein.family_id; const len = (protein.sequence || '').length; const maxAct = protein.max_sae_activation ?? Math.max(...(protein.sae_activation_profile || [0])); return (

{title}

{kind === 'swiss' ? ( {acc} · {len} aa ) : ( {acc} · {len} aa · {protein.category} · {' '} fleming.gr → )} max act {fmt(maxAct, 2)}

{kind === 'swiss' ? `3Dmol cartoon · /api/pdb/${acc}` : `ESMFold cartoon · /api/nmpfam-pdb/${acc}`}

); } window.CaseStudyGeometry = CaseStudyGeometry; window.CaseStudyGranularity = CaseStudyGranularity; window.CaseStudyMetagenomic = CaseStudyMetagenomic; window.CaseStudyMetagenomicDetail = CaseStudyMetagenomicDetail; window.CaseStudyFamilyDetail = CaseStudyFamilyDetail;

Geometry annotates features missing DB labels

Geometry-primary feature list

Geometry is more granular than biology

Transfer to metagenomic proteins

Geometric annotation transfers to metagenomic proteins

{features.length.toLocaleString()} features pass the Table 4 column 3 gate

{group.annotation_name || group.name}

Feature list

How similar are these features in the 44-dim importance space?

Where each member sits in the geometric descriptor space

Different members hit different residues on the same protein

Geometric annotation transfer · f/{featureId}