Nicholai 2bc680ca63 refactor: restructure into monorepo
Move flat src/ layout into packages/ monorepo:
- packages/core: scraping, embeddings, storage, clustering, analysis
- packages/cli: CLI and TUI interface
- packages/web: Next.js web dashboard

Add playwright screenshots, sqlite storage, and settings.
2026-01-24 00:12:14 -07:00

42 lines
1.1 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import type { ProblemSummary } from './types'
/**
 * Pairwise similarity scores together with the metadata needed to label them.
 *
 * As produced by `computeKeywordSimilarity`, all three arrays are index-aligned:
 * position `i` in `labels` and `clusterIds` describes row/column `i` of `matrix`.
 */
export interface SimilarityMatrix {
  /** Square grid of scores; `matrix[i][j]` is the similarity between entry `i` and entry `j`. */
  matrix: number[][]
  /** Display label for each row/column. */
  labels: string[]
  /** Cluster identifier for each row/column. */
  clusterIds: number[]
}
/**
 * Compute the pairwise Jaccard similarity between clusters based on their keywords.
 *
 *   jaccard(A, B) = |A ∩ B| / |A ∪ B|
 *
 * Keywords are lowercased and de-duplicated per cluster before comparison, so
 * `"API"` and `"api"` count as the same keyword.
 *
 * @param clusters - Clusters to compare; only `keywords`, `problem` and `clusterId` are read.
 * @returns A symmetric `n × n` matrix with `1` on the diagonal, plus index-aligned
 *   labels (the first 25 UTF-16 code units of `problem`, with `...` appended when
 *   truncated) and cluster ids. Two clusters that both have no keywords score `0`.
 */
export function computeKeywordSimilarity(clusters: ProblemSummary[]): SimilarityMatrix {
  const n = clusters.length

  // Normalize every cluster's keywords exactly once, instead of rebuilding both
  // sets for each (i, j) pair inside the nested loop.
  const keywordSets = clusters.map(c => new Set(c.keywords.map(k => k.toLowerCase())))

  const matrix: number[][] = Array.from({ length: n }, () => Array<number>(n).fill(0))

  for (let i = 0; i < n; i++) {
    const setA = keywordSets[i]
    // A cluster is always fully similar to itself, even with no keywords.
    matrix[i][i] = 1

    // Only the upper triangle is computed; the result is mirrored below.
    for (let j = i + 1; j < n; j++) {
      const setB = keywordSets[j]

      let intersection = 0
      for (const keyword of setA) {
        if (setB.has(keyword)) intersection++
      }

      // Inclusion–exclusion: |A ∪ B| = |A| + |B| − |A ∩ B|.
      const union = setA.size + setB.size - intersection
      // Guard against 0 / 0 when both clusters have no keywords.
      const similarity = union > 0 ? intersection / union : 0

      matrix[i][j] = similarity
      matrix[j][i] = similarity
    }
  }

  return {
    matrix,
    labels: clusters.map(c => c.problem.slice(0, 25) + (c.problem.length > 25 ? '...' : '')),
    clusterIds: clusters.map(c => c.clusterId),
  }
}