created embeddings

This commit is contained in:
George Powell
2025-12-26 00:33:16 -05:00
parent d8cff2ff7a
commit 0daefcb080
8 changed files with 443 additions and 4 deletions

View File

@@ -1,6 +1,9 @@
import type { Handle } from '@sveltejs/kit';
import * as auth from '$lib/server/auth';
import { initializeEmbeddings } from '$lib/server/bible-embeddings';
import { getAllNKJVVerses } from '$lib/server/xml-bible';
const handleAuth: Handle = async ({ event, resolve }) => {
const sessionToken = event.cookies.get(auth.sessionCookieName);
@@ -26,3 +29,7 @@ const handleAuth: Handle = async ({ event, resolve }) => {
};
// The only server hook installed is the auth handler defined above.
export const handle: Handle = handleAuth;

// Prepare the verse embeddings once, when this module is first evaluated.
// Because of the top-level await, this module does not finish loading until
// initializeEmbeddings() has resolved.
await initializeEmbeddings(getAllNKJVVerses());

View File

@@ -0,0 +1,98 @@
import { pipeline } from '@xenova/transformers';
import type { FeatureExtractionPipeline, Tensor } from '@xenova/transformers';
import fs from 'fs/promises';
/** A single Bible verse together with its reference. */
type VerseEntry = { text: string; book: string; chapter: number; verse: number };

/** Length of each embedding vector; used to slice batch output and to validate query vectors. */
const EMBEDDING_DIM = 384;

/** Feature-extraction pipeline; stays null until initializeEmbeddings() has loaded the model. */
let extractor: FeatureExtractionPipeline | null = null;

/** One embedding per verse, index-aligned with `verses`. */
let verseEmbeddings: Float32Array[] = [];

/** Verses that can be searched, index-aligned with `verseEmbeddings`. */
let verses: VerseEntry[] = [];
/**
 * Reads a previously written embeddings cache and validates its structure.
 *
 * A cache is only accepted when it holds two non-empty arrays of equal length
 * and every cached vector has EMBEDDING_DIM entries; anything else is treated
 * as unusable so the caller recomputes instead of serving broken data.
 *
 * @param cachePath Location of the JSON cache file.
 * @returns The cached embeddings and verses, or null when there is no usable cache.
 */
async function readEmbeddingsCache(
  cachePath: string
): Promise<{ embeddings: Float32Array[]; verses: typeof verses } | null> {
  let raw: string;
  try {
    raw = await fs.readFile(cachePath, 'utf-8');
  } catch (err: unknown) {
    // A missing file is the normal first-run case; anything else deserves a warning.
    const missing = err instanceof Error && 'code' in err && err.code === 'ENOENT';
    if (!missing) {
      console.warn(`Could not read embeddings cache at ${cachePath}:`, err);
    }
    return null;
  }

  try {
    const parsed = JSON.parse(raw) as { embeddings?: unknown; verses?: unknown } | null;
    const embeddingRows = parsed?.embeddings;
    const cachedVerses = parsed?.verses;
    if (
      !Array.isArray(embeddingRows) ||
      !Array.isArray(cachedVerses) ||
      embeddingRows.length === 0 ||
      embeddingRows.length !== cachedVerses.length
    ) {
      throw new Error('unexpected cache layout');
    }
    const embeddings = embeddingRows.map((row: number[]) => Float32Array.from(row));
    if (embeddings.some((vector) => vector.length !== EMBEDDING_DIM)) {
      throw new Error(`cached vectors are not ${EMBEDDING_DIM}-dimensional`);
    }
    return { embeddings, verses: cachedVerses as typeof verses };
  } catch (err: unknown) {
    console.warn(`Ignoring unusable embeddings cache at ${cachePath}:`, err);
    return null;
  }
}

/**
 * Loads the embedding model and prepares one embedding per verse.
 *
 * Embeddings are taken from the on-disk JSON cache when a valid one exists;
 * otherwise every verse is encoded in batches and the result is written back
 * to the cache. Calling this again after the model has been loaded is a no-op.
 *
 * The module-level `verseEmbeddings` / `verses` are only replaced once a full,
 * consistent set is available, so a failure part-way through never leaves them
 * half-filled.
 *
 * @param bibleVerses Verses to embed when no usable cache is present. When a
 *   cache is loaded, the verses stored in the cache are used instead.
 * @throws If the model cannot be loaded, encoding fails, or the model returns
 *   vectors whose size does not match EMBEDDING_DIM.
 */
export async function initializeEmbeddings(bibleVerses: Array<{ text: string; book: string; chapter: number; verse: number; }>) {
  if (extractor) return; // Already initialized

  console.log('Loading embedding model...');
  const model = await pipeline('feature-extraction', 'Xenova/all-MiniLM-L12-v2');
  // main ^
  // const model = await pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2'); not used
  // const model = await pipeline('feature-extraction', 'Xenova/gte-base'); // testing
  extractor = model;

  const CACHE_PATH = './embeddings-cache-L12.json';
  // main ^
  // const CACHE_PATH = './embeddings-cache-L6.json'; not used
  // const CACHE_PATH = './embeddings-cache-GTE-base.json'; // testing

  const cached = await readEmbeddingsCache(CACHE_PATH);
  if (cached) {
    verseEmbeddings = cached.embeddings;
    verses = cached.verses;
    console.log('Loaded embeddings from cache!');
    return;
  }
  console.log('No cache found, computing embeddings...');

  console.log(`Encoding ${bibleVerses.length} verses in small batches to manage memory...`);
  const BATCH_SIZE = 128;
  const texts = bibleVerses.map((v) => v.text);
  const computed: Float32Array[] = [];

  for (let start = 0; start < texts.length; start += BATCH_SIZE) {
    const batchTexts = texts.slice(start, start + BATCH_SIZE);
    console.log(`Processing batch ${Math.floor(start / BATCH_SIZE) + 1} (${batchTexts.length} verses)...`);

    const output = await model(batchTexts, { pooling: 'mean', normalize: true });
    const data = output.data as Float32Array;

    // Fail fast if the model's vector size differs from EMBEDDING_DIM; slicing
    // with the wrong stride would silently produce meaningless embeddings.
    if (data.length !== batchTexts.length * EMBEDDING_DIM) {
      throw new Error(
        `Unexpected embedding output size: got ${data.length} values for ${batchTexts.length} verses, expected ${EMBEDDING_DIM} per verse`
      );
    }

    for (let k = 0; k < batchTexts.length; k++) {
      // slice() on a typed array already returns an independent copy.
      computed.push(data.slice(k * EMBEDDING_DIM, (k + 1) * EMBEDDING_DIM));
    }
  }

  // Publish the finished set in one step.
  verseEmbeddings = computed;
  verses = bibleVerses;

  // Save to cache. The cache is only an optimisation, so a write failure is
  // reported but does not undo the embeddings that are already in memory.
  try {
    const embeddingsData = {
      embeddings: computed.map((e) => Array.from(e)),
      verses: bibleVerses
    };
    await fs.writeFile(CACHE_PATH, JSON.stringify(embeddingsData));
    console.log('Embeddings computed and cached to disk!');
  } catch (err: unknown) {
    console.warn(`Embeddings computed, but writing the cache to ${CACHE_PATH} failed:`, err);
  }
}
/**
 * Similarity score between two embedding vectors.
 *
 * This computes the dot product, which equals the cosine similarity only when
 * both vectors are unit-length. Callers in this file request `normalize: true`
 * from the extractor, so that precondition holds for them.
 *
 * @param a First vector.
 * @param b Second vector; must have the same length as `a`.
 * @returns The dot product of `a` and `b`.
 * @throws If the vectors differ in length, since the result would otherwise be
 *   NaN or a silently truncated sum.
 */
function cosineSimilarity(a: Float32Array, b: Float32Array): number {
  if (a.length !== b.length) {
    throw new Error(`Vector length mismatch: ${a.length} != ${b.length}`);
  }
  let sum = 0;
  for (let i = 0; i < a.length; i++) {
    sum += a[i] * b[i];
  }
  return sum;
}
/**
 * Finds the verses whose embeddings score highest against a query sentence.
 *
 * @param sentence Text to compare against every verse.
 * @param topK Maximum number of matches to return (default 10). Fractions are
 *   truncated; zero, negative or NaN values yield an empty result.
 * @returns Up to `topK` verses, each with a `score` field, ordered from highest
 *   to lowest score.
 * @throws If the embeddings have not been initialized, if the verse and
 *   embedding lists are out of sync, or if the query embedding does not have
 *   EMBEDDING_DIM entries.
 */
export async function findSimilarVerses(sentence: string, topK: number = 10) {
  if (!extractor || verseEmbeddings.length === 0) {
    throw new Error('Embeddings not initialized');
  }
  if (verseEmbeddings.length !== verses.length) {
    throw new Error(`Embeddings/verses length mismatch: ${verseEmbeddings.length} != ${verses.length}`);
  }

  // Array.prototype.slice treats a negative end as an offset from the end of
  // the array, so an unchecked negative topK would return almost every verse.
  const limit = Number.isNaN(topK) ? 0 : Math.max(0, Math.trunc(topK));
  if (limit === 0) {
    return [];
  }

  // Encode query sentence
  const queryOutput = await extractor(sentence, { pooling: 'mean', normalize: true });
  const queryEmbedding = queryOutput.data as Float32Array;
  if (queryEmbedding.length !== EMBEDDING_DIM) {
    throw new Error(`Query embedding dim mismatch: ${queryEmbedding.length} != ${EMBEDDING_DIM}`);
  }

  // Rank by index first so that verse objects are only copied for the rows
  // that are actually returned, not for the whole corpus.
  const ranked = verseEmbeddings
    .map((embedding, index) => ({ index, score: cosineSimilarity(queryEmbedding, embedding) }))
    .sort((a, b) => b.score - a.score)
    .slice(0, limit);

  return ranked.map(({ index, score }) => ({ ...verses[index], score }));
}

View File

@@ -0,0 +1,20 @@
import { json } from '@sveltejs/kit';
import { findSimilarVerses } from '$lib/server/bible-embeddings';
import type { RequestHandler } from './$types';
/** Largest number of matches a single request may ask for. */
const MAX_TOP_K = 100;

/**
 * POST handler that returns the verses most similar to a sentence.
 *
 * Expects a JSON body of the form `{ sentence: string, topK?: number }`.
 * Responds with `{ results }` on success, with status 400 and `{ error }` when
 * the body is malformed, and with status 500 and `{ error }` when the lookup
 * itself fails. `topK` defaults to 10 and is capped at MAX_TOP_K.
 */
export const POST: RequestHandler = async ({ request }) => {
  // A body that is not valid JSON is a client error, not a server failure.
  let body: unknown;
  try {
    body = await request.json();
  } catch {
    return json({ error: 'Invalid JSON body' }, { status: 400 });
  }

  // JSON such as `null` or `42` parses fine but cannot be destructured.
  if (typeof body !== 'object' || body === null) {
    return json({ error: 'Invalid sentence' }, { status: 400 });
  }

  const { sentence, topK = 10 } = body as { sentence?: unknown; topK?: unknown };

  if (typeof sentence !== 'string' || !sentence) {
    return json({ error: 'Invalid sentence' }, { status: 400 });
  }

  if (typeof topK !== 'number' || !Number.isInteger(topK) || topK < 1) {
    return json({ error: 'Invalid topK' }, { status: 400 });
  }

  try {
    const results = await findSimilarVerses(sentence, Math.min(topK, MAX_TOP_K));
    return json({ results });
  } catch (error) {
    console.error('Error finding similar verses:', error);
    return json({ error: 'Failed to find similar verses' }, { status: 500 });
  }
};

View File

@@ -0,0 +1,214 @@
<script lang="ts">
  /** One verse returned by the similarity API, together with its match score. */
  type VerseMatch = {
    book: string;
    chapter: number;
    verse: number;
    text: string;
    score: number;
  };

  let sentence = $state("");
  let results = $state<VerseMatch[]>([]);
  let loading = $state(false);
  // Set once a search has completed successfully, so the "no results" message
  // is shown only after a real search rather than as soon as the user types.
  let hasSearched = $state(false);
  // Message shown to the user for the most recent failed search ("" if none).
  let errorMessage = $state("");

  /**
   * Sends the current sentence to the similarity API and stores the matches.
   * Failures are logged, clear the result list and surface a message in the UI.
   */
  async function searchVerses() {
    // Skip blank input instead of sending a request for it.
    if (!sentence.trim()) return;

    loading = true;
    errorMessage = "";
    try {
      const response = await fetch("/api/similar-verses", {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({ sentence, topK: 10 }),
      });
      if (!response.ok) {
        throw new Error(`HTTP ${response.status}: ${await response.text()}`);
      }
      const data = await response.json();
      if (data.error) {
        throw new Error(data.error);
      }
      results = data.results || [];
      hasSearched = true;
    } catch (error) {
      console.error("Search error:", error);
      results = [];
      errorMessage = "Something went wrong while searching. Please try again.";
    } finally {
      loading = false;
    }
  }
</script>

<div class="page">
  <h1 class="title">Similar Verse Finder</h1>
  <div class="search-section">
    <input
      bind:value={sentence}
      placeholder="Enter a sentence to find similar Bible verses..."
      class="input"
    />
    <button onclick={searchVerses} disabled={loading} class="button">
      {loading ? "Searching..." : "Find Similar Verses"}
    </button>
  </div>
  {#if results.length > 0}
    <div class="results">
      {#each results as result, i (i)}
        <article class="result">
          <header>
            <strong>{result.book} {result.chapter}:{result.verse}</strong>
            <span class="score">Score: {result.score.toFixed(3)}</span>
          </header>
          <p>{result.text}</p>
        </article>
      {/each}
    </div>
  {:else if errorMessage}
    <p class="no-results" role="alert">{errorMessage}</p>
  {:else if hasSearched && sentence.trim() && !loading}
    <p class="no-results">No similar verses found. Try another sentence!</p>
  {/if}
</div>
<style>
/* Centered page container, capped at 900px wide. */
.page {
max-width: 900px;
margin: 0 auto;
padding: 1rem 0.75rem;
font-family:
system-ui,
-apple-system,
sans-serif;
}
/* Heading: the gradient background is clipped to the text and the text fill is
transparent, so the gradient shows through the glyphs. */
.title {
text-align: center;
margin-bottom: 1.75rem;
font-size: clamp(2rem, 5vw, 3rem);
color: #2c3e50;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
background-clip: text;
}
/* Row holding the text input and the search button; wraps when space runs out. */
.search-section {
display: flex;
gap: 0.75rem;
margin-bottom: 1.5rem;
flex-wrap: wrap;
}
/* Text input: grows to fill the row. */
.input {
flex: 1;
min-width: 300px;
padding: 0.75rem 1rem;
border: 2px solid #e1e5e9;
border-radius: 12px;
font-size: 1.1rem;
transition: all 0.2s ease;
background: #fafbfc;
}
/* Focus state: the default outline is removed and replaced by a coloured
border plus a soft ring drawn with box-shadow. */
.input:focus {
outline: none;
border-color: #667eea;
box-shadow: 0 0 0 4px rgba(102, 126, 234, 0.1);
background: white;
}
/* Search button, using the same gradient as the title. */
.button {
padding: 0.75rem 1.5rem;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
border: none;
border-radius: 12px;
font-size: 1.1rem;
font-weight: 500;
cursor: pointer;
transition: all 0.2s ease;
white-space: nowrap;
}
/* Lift effect on hover, only while the button is enabled. */
.button:hover:not(:disabled) {
transform: translateY(-1px);
box-shadow: 0 8px 25px rgba(102, 126, 234, 0.3);
}
/* Disabled state: flat grey, no lift or shadow. */
.button:disabled {
background: #a0aec0;
cursor: not-allowed;
transform: none;
box-shadow: none;
}
/* Vertical list of result cards. */
.results {
display: flex;
flex-direction: column;
gap: 1rem;
}
/* A single result card. */
.result {
background: linear-gradient(145deg, #ffffff 0%, #f8fafc 100%);
border: 1px solid #e2e8f0;
border-radius: 16px;
padding: 1.25rem;
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08);
transition: all 0.2s ease;
}
/* Cards rise slightly and gain a deeper shadow on hover. */
.result:hover {
transform: translateY(-2px);
box-shadow: 0 12px 24px rgba(0, 0, 0, 0.15);
}
/* Card header: verse reference on the left, score on the right. */
.result header {
display: flex;
justify-content: space-between;
align-items: flex-start;
margin-bottom: 0.75rem;
gap: 0.75rem;
}
/* Verse reference. */
.result strong {
font-size: 1.3rem;
color: #1a202c;
}
/* Similarity score label; kept on one line. */
.score {
font-size: 1rem;
color: #718096;
font-weight: 500;
white-space: nowrap;
}
/* Verse text. */
.result p {
margin: 0;
line-height: 1.7;
color: #4a5568;
font-size: 1.1rem;
}
/* Message shown in place of the result list. */
.no-results {
text-align: center;
padding: 1.75rem 0.75rem;
color: #a0aec0;
font-size: 1.2rem;
font-style: italic;
}
/* Viewports up to 768px wide: stack the search controls and the card header
vertically, and drop the input's minimum width. */
@media (max-width: 768px) {
.search-section {
flex-direction: column;
}
.input {
min-width: unset;
}
.result header {
flex-direction: column;
align-items: flex-start;
gap: 0.5rem;
}
}
</style>