package Brain;

# Brain.pm — Persistent semantic memory for Claude Code
#
# Author: Michael Limberger
# Title:  Senior Web & Linux Administrator, IITS, Faculty of Dentistry,
#         University of Toronto
#
# Self-contained Perl module — no external dependencies on sibling tools.
#
# Architecture:
#   - SQLite database stores text chunks with metadata
#   - Ollama bge-m3 generates embedding vectors for each chunk
#   - PDL stores vectors in a fast binary matrix on disk
#   - Cosine similarity finds semantically relevant chunks
#
# The key insight from the original RAG system: semantic search beats
# keyword search because you can ask "what do we know about performance?"
# and find chunks about "latency", "throughput", "slow queries" even
# though those exact words weren't in the query.
#
# Invariant the whole module depends on: row i of the on-disk vector
# matrix corresponds to the i-th chunk when chunks are ordered by id.
# _rebuild_vectors() establishes it; _embed_single() extends it; search()
# verifies it and self-heals when a failed incremental embed broke it.

use strict;
use warnings;

use Mojo::UserAgent;
use Mojo::SQLite;
use Mojo::JSON qw(encode_json decode_json);
use File::Path qw(make_path);
use File::Find qw(find);
use File::Basename qw(basename dirname);
use Carp qw(croak);
use PDL;
use PDL::IO::FastRaw;

# ---------------------------------------------------------------
# Constructor
# ---------------------------------------------------------------
# Brain->new(%opts) — build a Brain handle.
#
# Options (all optional):
#   brain_dir  => storage directory (default: ~/.brain, created if missing)
#   ollama_url => Ollama base URL  (default: http://localhost:11434)
#   model      => embedding model  (default: bge-m3)
#   chunk_size => approx tokens per chunk   (default: 512)
#   overlap    => approx overlap tokens     (default: 64)
#   batch_size => embeddings per HTTP batch (default: 32)
#   limit      => default search results    (default: 5)
#   verbose    => print progress to STDOUT  (default: 0)
sub new {
    my ($class, %opts) = @_;

    my $brain_dir  = $opts{brain_dir}  // "$ENV{HOME}/.brain";
    my $ollama_url = $opts{ollama_url} // 'http://localhost:11434';
    my $model      = $opts{model}      // 'bge-m3';

    make_path($brain_dir) unless -d $brain_dir;

    my $db_path = "$brain_dir/brain.db";
    my $sql     = Mojo::SQLite->new("sqlite:$db_path");

    # Generous timeouts: embedding a large batch can take a while on CPU.
    my $ua = Mojo::UserAgent->new(
        connect_timeout    => 10,
        inactivity_timeout => 120,
        request_timeout    => 120,
        max_response_size  => 0,
    );

    my $self = bless {
        brain_dir  => $brain_dir,
        vec_path   => "$brain_dir/vectors",      # PDL writes vectors + vectors.hdr
        ollama_url => $ollama_url,
        model      => $model,
        chunk_size => $opts{chunk_size} // 512,  # tokens per chunk
        overlap    => $opts{overlap}    // 64,   # overlap tokens
        batch_size => $opts{batch_size} // 32,   # embeddings per batch
        limit      => $opts{limit}      // 5,    # default search results
        sql        => $sql,
        ua         => $ua,
        vectors    => undef,                     # lazily loaded PDL matrix
        verbose    => $opts{verbose} // 0,
    }, $class;

    $self->_init_db();
    return $self;
}

# ---------------------------------------------------------------
# Database Schema
# ---------------------------------------------------------------
# Create tables/indexes if missing and apply in-place migrations.
# Idempotent — safe to call on every construction.
sub _init_db {
    my ($self) = @_;
    my $db = $self->{sql}->db;

    # WAL + NORMAL: good durability/latency trade-off for a local store.
    $db->query('PRAGMA journal_mode = WAL');
    $db->query('PRAGMA synchronous = NORMAL');

    # Chunks — the core content store
    $db->query(<<'SQL');
CREATE TABLE IF NOT EXISTS chunks (
    id          INTEGER PRIMARY KEY AUTOINCREMENT,
    source      TEXT NOT NULL,
    source_type TEXT NOT NULL,
    section     TEXT,
    chunk_index INTEGER NOT NULL,
    content     TEXT NOT NULL,
    tokens      INTEGER,
    created_at  TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%fZ','now'))
)
SQL

    $db->query(<<'SQL');
CREATE INDEX IF NOT EXISTS idx_chunks_source ON chunks(source)
SQL

    # Sources — tracking what's been indexed
    $db->query(<<'SQL');
CREATE TABLE IF NOT EXISTS sources (
    id         INTEGER PRIMARY KEY AUTOINCREMENT,
    path       TEXT NOT NULL UNIQUE,
    doc_type   TEXT NOT NULL,
    num_chunks INTEGER NOT NULL,
    indexed_at TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%fZ','now'))
)
SQL

    # Memories — quick key-value thoughts (not chunked, directly stored)
    $db->query(<<'SQL');
CREATE TABLE IF NOT EXISTS memories (
    id         INTEGER PRIMARY KEY AUTOINCREMENT,
    category   TEXT NOT NULL DEFAULT 'general',
    tags       TEXT,
    content    TEXT NOT NULL,
    created_at TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%fZ','now'))
)
SQL

    # Migrate existing DBs: add tags column if missing.  Probing with
    # LIMIT 0 is cheap and fails only when the column does not exist.
    eval { $db->query('SELECT tags FROM memories LIMIT 0'); };
    if ($@) {
        eval { $db->query('ALTER TABLE memories ADD COLUMN tags TEXT') };
    }

    $db->query(<<'SQL');
CREATE INDEX IF NOT EXISTS idx_memories_category ON memories(category)
SQL

    # Log — session activity journal
    $db->query(<<'SQL');
CREATE TABLE IF NOT EXISTS log (
    id         INTEGER PRIMARY KEY AUTOINCREMENT,
    entry      TEXT NOT NULL,
    created_at TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%fZ','now'))
)
SQL
}

# ---------------------------------------------------------------
# Public API: Store
# ---------------------------------------------------------------
# $brain->store($content, category => ..., tags => ...)
#
# Store a quick memory.  The text is written to the memories table and
# also inserted as a single searchable chunk, then embedded incrementally
# (appended to the vector matrix — no full rebuild).  Returns 1.
# Croaks when $content is missing/empty.
#
# NOTE: if the incremental embed fails (Ollama down), the chunk row still
# exists without a vector; search() detects the mismatch and rebuilds.
sub store {
    my ($self, $content, %opts) = @_;
    my $category = $opts{category} // 'general';
    my $tags     = $opts{tags};

    croak "content required" unless $content && length($content) > 0;

    my $db = $self->{sql}->db;
    $db->query(
        'INSERT INTO memories (category, tags, content) VALUES (?, ?, ?)',
        $category, $tags, $content,
    );

    # Also insert as a chunk so it's searchable via embeddings.
    # time() in the synthetic path keeps memory sources unique-ish.
    my $source = "memory:$category:" . time();
    $db->query(
        'INSERT INTO chunks (source, source_type, section, chunk_index, content, tokens)
         VALUES (?, ?, ?, ?, ?, ?)',
        $source, 'memory', $category, 0, $content, _estimate_tokens($content),
    );
    $db->query(
        'INSERT OR REPLACE INTO sources (path, doc_type, num_chunks) VALUES (?, ?, ?)',
        $source, 'memory', 1,
    );

    # Incremental embed — append to existing matrix instead of rebuilding
    $self->_embed_single($content);

    $self->_verbose("Stored memory [$category]: " . _truncate($content, 80));
    return 1;
}

# ---------------------------------------------------------------
# Public API: Search
# ---------------------------------------------------------------
# $brain->search($query, $limit) — semantic search over all chunks.
#
# Embeds the query via Ollama and ranks chunks by cosine similarity.
# Returns an arrayref of { score, source, source_type, section, content }
# hashrefs, best first.  Returns [] when there is nothing indexed, the
# vector matrix is unavailable, or the query cannot be embedded.
sub search {
    my ($self, $query, $limit) = @_;
    $limit //= $self->{limit};

    my $count = $self->{sql}->db->query('SELECT COUNT(*) AS n FROM chunks')->hash->{n};
    return [] unless $count > 0;

    # Load vectors if needed
    $self->_load_vectors() unless defined $self->{vectors};

    # FIX: self-heal a vector/chunk desync.  If a previous incremental
    # embed failed (or the matrix file was lost), the matrix row count no
    # longer matches the chunk count and the row->chunk OFFSET mapping in
    # _cosine_search would return the WRONG chunk for newer vectors.
    if (!defined $self->{vectors} || $self->{vectors}->dim(1) != $count) {
        $self->_verbose("Vector matrix out of sync with chunks — rebuilding");
        $self->_rebuild_vectors();
    }
    return [] unless defined $self->{vectors};

    # Embed the query
    my $embeddings = $self->_embed($query);
    return [] unless $embeddings && @$embeddings;

    my $query_vec = pdl(float, $embeddings->[0]);
    return $self->_cosine_search($query_vec, $limit);
}

# ---------------------------------------------------------------
# Public API: Index a file or directory
# ---------------------------------------------------------------
# $brain->index_path($path) — index one file, or recurse a directory.
#
# Skips binary-ish extensions and hidden files.  Re-indexing a known
# path replaces its previous chunks.  Rebuilds the full vector matrix
# when anything was added.  Returns { files => N, chunks => M }.
# Croaks when $path does not exist.
sub index_path {
    my ($self, $path) = @_;
    croak "Path not found: $path" unless -e $path;

    my @files;
    if (-d $path) {
        find({
            wanted => sub {
                return unless -f $_;
                return if /\.(gz|bz2|xz|zip|tar|png|jpg|gif|pdf|bin|o|so|dylib|db|hdr)$/i;
                return if basename($_) =~ /^\./;    # skip hidden files
                push @files, $_;
            },
            no_chdir => 1,
        }, $path);
    }
    else {
        @files = ($path);
    }

    my $total_chunks = 0;
    for my $file (@files) {
        my $count = $self->_index_file($file);
        $total_chunks += $count;
        # _verbose() already checks the verbose flag — no extra guard.
        $self->_verbose("Indexed $file ($count chunks)");
    }

    $self->_rebuild_vectors() if $total_chunks > 0;

    return { files => scalar(@files), chunks => $total_chunks };
}

# ---------------------------------------------------------------
# Public API: Log an entry
# ---------------------------------------------------------------
# $brain->log_entry($text) — append to the session activity journal.
# Croaks when $text is missing/empty.  Returns 1.
sub log_entry {
    my ($self, $entry) = @_;
    croak "entry required" unless $entry && length($entry) > 0;
    $self->{sql}->db->query('INSERT INTO log (entry) VALUES (?)', $entry);
    return 1;
}

# $brain->log_recent($limit) — newest-first journal entries (default 20).
# Returns an arrayref of { id, entry, created_at } hashrefs.
sub log_recent {
    my ($self, $limit) = @_;
    $limit //= 20;
    return $self->{sql}->db->query(
        'SELECT id, entry, created_at FROM log ORDER BY id DESC LIMIT ?', $limit
    )->hashes->to_array;
}

# ---------------------------------------------------------------
# Public API: Forget (remove a source)
# ---------------------------------------------------------------
# $brain->forget($pattern) — remove every source whose path contains
# $pattern (SQL LIKE, substring match) along with its chunks, then
# rebuild the vector matrix so search stays consistent.  Returns the
# number of sources removed.  Croaks without a pattern.
sub forget {
    my ($self, $source_pattern) = @_;
    croak "source pattern required" unless $source_pattern;

    my $db = $self->{sql}->db;

    # Find matching sources
    my $sources = $db->query(
        'SELECT path FROM sources WHERE path LIKE ?', "%$source_pattern%"
    )->hashes->to_array;

    my $removed = 0;
    for my $s (@$sources) {
        $db->query('DELETE FROM chunks WHERE source = ?',  $s->{path});
        $db->query('DELETE FROM sources WHERE path = ?',   $s->{path});
        $removed++;
        $self->_verbose("Forgot: $s->{path}");
    }

    # _rebuild_vectors() also clears the matrix when no chunks remain,
    # so forgetting the last source cannot leave stale vectors behind.
    $self->_rebuild_vectors() if $removed > 0;
    return $removed;
}

# ---------------------------------------------------------------
# Public API: Status
# ---------------------------------------------------------------
# $brain->status() — summary hashref: counts, DB size, per-type and
# per-category breakdowns, last index time, and embed configuration.
sub status {
    my ($self) = @_;
    my $db = $self->{sql}->db;

    my $sources  = $db->query('SELECT COUNT(*) AS n FROM sources')->hash->{n};
    my $chunks   = $db->query('SELECT COUNT(*) AS n FROM chunks')->hash->{n};
    my $memories = $db->query('SELECT COUNT(*) AS n FROM memories')->hash->{n};
    my $logs     = $db->query('SELECT COUNT(*) AS n FROM log')->hash->{n};
    my $latest   = $db->query('SELECT MAX(indexed_at) AS t FROM sources')->hash->{t};

    my $by_type = $db->query(
        'SELECT doc_type AS source_type, COUNT(*) AS sources, SUM(num_chunks) AS chunks
         FROM sources GROUP BY doc_type'
    )->hashes->to_array;

    my $by_category = $db->query(
        'SELECT category, COUNT(*) AS n FROM memories GROUP BY category ORDER BY n DESC'
    )->hashes->to_array;

    my $vec_exists = -f "$self->{vec_path}" || -f "$self->{vec_path}.hdr";
    my $db_size    = -f "$self->{brain_dir}/brain.db"
        ? sprintf("%.1f KB", (-s "$self->{brain_dir}/brain.db") / 1024)
        : '0 KB';

    return {
        brain_dir     => $self->{brain_dir},
        db_size       => $db_size,
        sources       => $sources,
        chunks        => $chunks,
        memories      => $memories,
        log_entries   => $logs,
        last_indexed  => $latest // 'never',
        vectors_built => $vec_exists ? 1 : 0,
        embed_model   => $self->{model},
        ollama_url    => $self->{ollama_url},
        by_type       => $by_type,
        by_category   => $by_category,
    };
}

# ---------------------------------------------------------------
# Public API: List memories
# ---------------------------------------------------------------
# $brain->memories(category => ..., limit => ...) — newest-first list of
# stored memories, optionally filtered by category (default limit 50).
# Returns an arrayref of { id, category, tags, content, created_at }.
sub memories {
    my ($self, %opts) = @_;
    my $category = $opts{category};
    my $limit    = $opts{limit} // 50;

    my $db = $self->{sql}->db;
    if ($category) {
        return $db->query(
            'SELECT id, category, tags, content, created_at FROM memories
             WHERE category = ? ORDER BY id DESC LIMIT ?',
            $category, $limit
        )->hashes->to_array;
    }
    return $db->query(
        'SELECT id, category, tags, content, created_at FROM memories
         ORDER BY id DESC LIMIT ?', $limit
    )->hashes->to_array;
}

# ---------------------------------------------------------------
# Ollama Embeddings
# ---------------------------------------------------------------
# $self->_embed($text_or_arrayref) — POST to Ollama /api/embed.
#
# Returns the arrayref of embedding vectors on success, undef on any
# failure (connection refused, HTTP error, malformed response).  Warns
# with actionable hints rather than dying so callers can degrade.
sub _embed {
    my ($self, $input) = @_;

    my $payload = {
        model => $self->{model},
        input => ref $input eq 'ARRAY' ? $input : [$input],
    };

    my $url = "$self->{ollama_url}/api/embed";
    my $tx  = $self->{ua}->post($url => json => $payload);

    my $err = $tx->error;
    if ($err) {
        if ($err->{code}) {
            warn "Embedding failed: HTTP $err->{code} from $self->{ollama_url}\n";
        }
        else {
            warn "Embedding failed: Cannot connect to Ollama at $self->{ollama_url} — $err->{message}\n";
            warn "  Is Ollama running? Try: ollama serve\n";
            warn "  Is the model pulled? Try: ollama pull $self->{model}\n";
        }
        return undef;
    }

    my $res = $tx->result;
    unless ($res && $res->is_success) {
        warn "Embedding failed: unexpected response from $self->{ollama_url}\n";
        return undef;
    }

    my $data = $res->json;
    return $data->{embeddings};
}

# ---------------------------------------------------------------
# Chunking
# ---------------------------------------------------------------
# $self->_index_file($path) — slurp one file, chunk it by detected type,
# replace any previous chunks for this path, and record it in sources.
# Returns the number of chunks stored (0 on unreadable/trivial files).
sub _index_file {
    my ($self, $path) = @_;

    open my $fh, '<', $path or return 0;
    my $content = do { local $/; <$fh> };
    close $fh;

    # Skip empty/near-empty files — not worth an embedding.
    return 0 unless $content && length($content) > 20;

    my $type = _detect_type($path);

    my @chunks;
    if ($type eq 'markdown') {
        @chunks = $self->_chunk_markdown($content);
    }
    elsif ($type eq 'config') {
        @chunks = $self->_chunk_config($content);
    }
    else {
        @chunks = $self->_chunk_plain($content);
    }
    return 0 unless @chunks;

    # Remove old if exists
    $self->_remove_source($path);

    my $db = $self->{sql}->db;
    for my $i (0 .. $#chunks) {
        $db->query(
            'INSERT INTO chunks (source, source_type, section, chunk_index, content, tokens)
             VALUES (?, ?, ?, ?, ?, ?)',
            $path, $type, $chunks[$i]{section}, $i,
            $chunks[$i]{text}, _estimate_tokens($chunks[$i]{text}),
        );
    }
    $db->query(
        'INSERT OR REPLACE INTO sources (path, doc_type, num_chunks) VALUES (?, ?, ?)',
        $path, $type, scalar @chunks,
    );

    return scalar @chunks;
}

# _detect_type($path) — map a filename to a chunking strategy:
# 'markdown', 'config', or 'plain'.
sub _detect_type {
    my ($path) = @_;
    my $base = basename($path);
    return 'markdown' if $base =~ /\.md$/i;
    return 'config'   if $base =~ /\.(conf|yaml|yml|ini|cfg|toml)$/i;
    return 'plain';    # Perl files: treat as plain text (no Perl-specific chunker yet)
}

# $self->_chunk_markdown($text) — list of { text, section } chunks.
#
# Split on ## headings only — keep ### subsections with their parent.
# This produces coherent, self-contained chunks instead of fragments.
# Oversized sections are further split at ### boundaries; a document
# with no usable sections falls back to plain-text chunking.
sub _chunk_markdown {
    my ($self, $text) = @_;
    my @chunks;

    my @sections = split /^(?=##\s)/m, $text;

    for my $section (@sections) {
        next unless $section && $section =~ /\S/;

        # Extract the ## heading as the section label
        my ($heading) = $section =~ /^(##\s+.+)/;
        $heading //= 'untitled';
        $heading =~ s/^#+\s*//;
        $heading =~ s/\s+$//;

        if (_estimate_tokens($section) > $self->{chunk_size} * 1.5) {
            # Section is too large — split at ### boundaries within it
            my @sub = $self->_split_at_subheadings($section, $heading);
            push @chunks, @sub;
        }
        else {
            push @chunks, { text => $section, section => $heading };
        }
    }

    @chunks = $self->_chunk_plain($text) unless @chunks;
    return @chunks;
}

# $self->_split_at_subheadings($text, $parent_heading)
#
# Split an oversized ## section at ### boundaries, greedily packing
# consecutive parts up to ~1.5x chunk_size.  Section labels become
# "parent > subheading".  Falls back to _split_large when the section
# has no ### boundaries at all.
sub _split_at_subheadings {
    my ($self, $text, $parent_heading) = @_;
    my @sub_chunks;

    my @parts = split /^(?=###\s)/m, $text;

    my @current;
    my $current_tokens  = 0;
    my $current_heading = $parent_heading;

    for my $part (@parts) {
        next unless $part && $part =~ /\S/;
        my $part_tokens = _estimate_tokens($part);

        # If adding this part would exceed the limit, flush current
        if ($current_tokens + $part_tokens > $self->{chunk_size} * 1.5 && @current) {
            push @sub_chunks, {
                text    => join("", @current),
                section => $current_heading,
            };
            @current        = ();
            $current_tokens = 0;
        }

        # Update heading label from ### if present (only when starting
        # a fresh chunk, so the label names the chunk's first part)
        if ($part =~ /^###\s+(.+)/ && !@current) {
            $current_heading = "$parent_heading > $1";
            $current_heading =~ s/\s+$//;
        }
        elsif (!@current) {
            $current_heading = $parent_heading;
        }

        push @current, $part;
        $current_tokens += $part_tokens;
    }

    if (@current) {
        push @sub_chunks, {
            text    => join("", @current),
            section => $current_heading,
        };
    }

    # Fallback to _split_large if no ### boundaries found
    return @sub_chunks if @sub_chunks;
    return $self->_split_large($text, $parent_heading);
}

# $self->_chunk_config($text) — chunk INI-style ([section]) or
# YAML-style (top-level "key:") config files one section per chunk;
# anything else degrades to plain-text chunking.
sub _chunk_config {
    my ($self, $text) = @_;
    my @chunks;

    if ($text =~ /^\[/m) {
        # INI-style: split at [section] headers
        my @sections = split /^(?=\[)/m, $text;
        for my $section (@sections) {
            next unless $section =~ /\S/;
            my ($heading) = $section =~ /^\[([^\]]+)\]/;
            $heading //= 'header';
            push @chunks, { text => $section, section => $heading };
        }
    }
    elsif ($text =~ /^\w+:/m) {
        # YAML-style: split at top-level keys
        my @sections = split /^(?=\w+:)/m, $text;
        for my $section (@sections) {
            next unless $section =~ /\S/;
            my ($heading) = $section =~ /^(\w+):/;
            $heading //= 'block';
            push @chunks, { text => $section, section => $heading };
        }
    }
    else {
        @chunks = $self->_chunk_plain($text);
    }

    return @chunks;
}

# $self->_chunk_plain($text) — fixed-size line-based chunking with a
# trailing-line overlap between consecutive chunks, labelled part-N.
sub _chunk_plain {
    my ($self, $text) = @_;
    my @lines = split /\n/, $text;
    my @chunks;
    my @current;
    my $current_tokens = 0;

    for my $line (@lines) {
        my $line_tokens = _estimate_tokens($line);

        if ($current_tokens + $line_tokens > $self->{chunk_size} && @current) {
            push @chunks, {
                text    => join("\n", @current),
                section => 'part-' . (scalar(@chunks) + 1),
            };

            # Carry the last ~overlap tokens of lines into the next chunk
            # so context isn't lost at chunk boundaries.
            my $overlap_tokens = 0;
            my @overlap;
            for my $l (reverse @current) {
                last if $overlap_tokens >= $self->{overlap};
                unshift @overlap, $l;
                $overlap_tokens += _estimate_tokens($l);
            }
            @current        = @overlap;
            $current_tokens = $overlap_tokens;
        }

        push @current, $line;
        $current_tokens += $line_tokens;
    }

    if (@current) {
        push @chunks, {
            text    => join("\n", @current),
            section => 'part-' . (scalar(@chunks) + 1),
        };
    }

    return @chunks;
}

# $self->_split_large($text, $heading) — like _chunk_plain but labels
# chunks "$heading (part N)".  Used for oversized markdown sections
# that have no ### subheadings to split at.
sub _split_large {
    my ($self, $text, $heading) = @_;
    my @sub_chunks;
    my @lines = split /\n/, $text;
    my @current;
    my $current_tokens = 0;
    my $part = 1;

    for my $line (@lines) {
        my $line_tokens = _estimate_tokens($line);

        if ($current_tokens + $line_tokens > $self->{chunk_size} && @current) {
            push @sub_chunks, {
                text    => join("\n", @current),
                section => "$heading (part $part)",
            };
            $part++;

            my $overlap_tokens = 0;
            my @overlap;
            for my $l (reverse @current) {
                last if $overlap_tokens >= $self->{overlap};
                unshift @overlap, $l;
                $overlap_tokens += _estimate_tokens($l);
            }
            @current        = @overlap;
            $current_tokens = $overlap_tokens;
        }

        push @current, $line;
        $current_tokens += $line_tokens;
    }

    if (@current) {
        push @sub_chunks, {
            text    => join("\n", @current),
            section => "$heading (part $part)",
        };
    }

    return @sub_chunks;
}

# ---------------------------------------------------------------
# Public API: Rebuild vectors
# ---------------------------------------------------------------
# $brain->rebuild() — re-embed every chunk from scratch.
sub rebuild {
    my ($self) = @_;
    return $self->_rebuild_vectors();
}

# ---------------------------------------------------------------
# Vector Storage & Search
# ---------------------------------------------------------------
# $self->_embed_single($text) — embed one text and append its vector to
# the existing matrix (incremental; avoids a full rebuild on store()).
# Returns 1 on success, nothing on embedding failure; search() will
# detect and repair the resulting row-count mismatch.
sub _embed_single {
    my ($self, $text) = @_;

    my $embeddings = $self->_embed($text);
    return unless $embeddings && @$embeddings;

    my $new_vec = pdl(float, $embeddings->[0]);

    if (defined $self->{vectors}) {
        # Append: glue new vector onto existing matrix along dim 1 (docs)
        $self->{vectors} = $self->{vectors}->glue(1, $new_vec->dummy(1));
    }
    else {
        # First vector — try loading from disk, or start fresh
        $self->_load_vectors();
        if (defined $self->{vectors}) {
            $self->{vectors} = $self->{vectors}->glue(1, $new_vec->dummy(1));
        }
        else {
            $self->{vectors} = $new_vec->dummy(1);
        }
    }

    # Persist to disk
    writefraw($self->{vectors}, $self->{vec_path});
    return 1;
}

# $self->_rebuild_vectors() — embed all chunks (in id order, batched)
# and replace the matrix in memory and on disk.  When no chunks exist,
# clears any stale matrix instead of leaving it behind.  Aborts (with a
# warning) if any batch fails to embed, leaving the old matrix intact.
sub _rebuild_vectors {
    my ($self) = @_;

    my $chunks = $self->{sql}->db->query(
        'SELECT id, content FROM chunks ORDER BY id'
    )->hashes->to_array;

    unless (@$chunks) {
        # FIX: nothing left to embed — drop the stale matrix both in
        # memory and on disk so searches can't map to deleted chunks.
        $self->{vectors} = undef;
        unlink $self->{vec_path}, "$self->{vec_path}.hdr";
        return;
    }

    print "  Embedding " . scalar(@$chunks) . " chunks...\n" if $self->{verbose};

    my @all_vectors;
    my $batch_size = $self->{batch_size};

    for (my $i = 0; $i < @$chunks; $i += $batch_size) {
        my $end = $i + $batch_size - 1;
        $end = $#$chunks if $end > $#$chunks;

        my @texts      = map { $_->{content} } @{$chunks}[$i .. $end];
        my $embeddings = $self->_embed(\@texts);

        unless ($embeddings && @$embeddings) {
            warn "Embedding failed at batch $i\n";
            return;
        }

        push @all_vectors, @$embeddings;
        print "  Embedded " . ($end + 1) . "/" . scalar(@$chunks) . "\n"
            if $self->{verbose};
    }

    my $matrix = pdl(float, \@all_vectors);
    writefraw($matrix, $self->{vec_path});
    $self->{vectors} = $matrix;
}

# $self->_load_vectors() — load the persisted matrix into memory.
# Leaves {vectors} undef when the files are missing or unreadable.
sub _load_vectors {
    my ($self) = @_;
    my $hdr = "$self->{vec_path}.hdr";
    # FIX: require BOTH the header and the raw data file — readfraw
    # dies when the data file is missing, so guard and trap it.
    return unless -f $hdr && -f $self->{vec_path};
    $self->{vectors} = eval { readfraw($self->{vec_path}) };
    warn "Failed to load vector matrix: $@" if $@;
}

# $self->_cosine_search($query_vec, $limit) — rank all document vectors
# against the query by cosine similarity and return the top $limit
# chunks.  Matrix rows map to chunks via "ORDER BY id OFFSET row" —
# valid because the matrix is built/appended in chunk-id order.
sub _cosine_search {
    my ($self, $query_vec, $limit) = @_;

    my $matrix = $self->{vectors};
    return [] unless defined $matrix;

    my ($dims, $num_docs) = $matrix->dims;
    return [] unless $num_docs > 0;

    # cosine = (A·B) / (|A| * |B|); epsilon avoids division by zero
    my $dot_products    = ($matrix * $query_vec->dummy(1))->sumover;
    my $doc_magnitudes  = $matrix->pow(2)->sumover->sqrt;
    my $query_magnitude = $query_vec->pow(2)->sumover->sqrt;
    my $scores = $dot_products / ($doc_magnitudes * $query_magnitude + 1e-10);

    $limit = $num_docs if $limit > $num_docs;

    # qsorti is ascending; reverse the index list for best-first
    my $sorted_indices = $scores->qsorti->slice("-1:0");
    my $top_k_indices  = $sorted_indices->slice("0:" . ($limit - 1));
    my $top_k_scores   = $scores->index($top_k_indices);

    my @results;
    my $db = $self->{sql}->db;
    for my $i (0 .. $limit - 1) {
        my $chunk_idx = $top_k_indices->at($i);
        my $score     = $top_k_scores->at($i);

        my $chunk = $db->query(
            'SELECT id, source, source_type, section, content FROM chunks ORDER BY id LIMIT 1 OFFSET ?',
            $chunk_idx,
        )->hash;
        next unless $chunk;

        push @results, {
            score       => sprintf("%.4f", $score),
            source      => $chunk->{source},
            source_type => $chunk->{source_type},
            section     => $chunk->{section},
            content     => $chunk->{content},
        };
    }

    return \@results;
}

# ---------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------
# $self->_remove_source($path) — delete one source and its chunks.
# Returns 1 if it existed, 0 otherwise.  Does NOT rebuild vectors;
# callers are responsible for that.
sub _remove_source {
    my ($self, $path) = @_;
    my $db = $self->{sql}->db;
    my $existing = $db->query('SELECT id FROM sources WHERE path = ?', $path)->hash;
    return 0 unless $existing;
    $db->query('DELETE FROM chunks WHERE source = ?', $path);
    $db->query('DELETE FROM sources WHERE path = ?',  $path);
    return 1;
}

# Print a progress message when verbose mode is on.
sub _verbose {
    my ($self, $msg) = @_;
    print "  $msg\n" if $self->{verbose};
}

# Rough token count: ~4 characters per token (good enough for chunk
# sizing; no tokenizer dependency).  Safe on undef.
sub _estimate_tokens {
    my ($text) = @_;
    return int(length($text // '') / 4);
}

# Truncate text to $len chars (default 80) with an ellipsis for display.
sub _truncate {
    my ($text, $len) = @_;
    $text //= '';    # FIX: avoid "uninitialized" warning on undef input
    $len  //= 80;
    return length($text) > $len ? substr($text, 0, $len) . '...' : $text;
}

1;

__END__

=head1 NAME

Brain — Persistent semantic memory for Claude Code

=head1 SYNOPSIS

    use Brain;

    my $brain = Brain->new(
        brain_dir  => "$ENV{HOME}/.brain",
        ollama_url => 'http://localhost:11434',
        model      => 'bge-m3',
    );

    # Store a memory
    $brain->store("Ollama bge-m3 is best for embeddings", category => 'models');

    # Search semantically
    my $results = $brain->search("what embedding model should I use?");

    # Index documents
    $brain->index_path("/path/to/knowledge/");

    # Check status
    my $status = $brain->status();

=head1 DESCRIPTION

A self-contained semantic memory system using SQLite for storage,
Ollama bge-m3 for embeddings, and PDL for fast vector math.

The brain stores text as chunks, embeds them as vectors, and finds
relevant content via cosine similarity search.  This means you can
search by meaning, not just keywords.

=head1 TABLES

=over 4

=item B<chunks> — Text content split into searchable pieces with embeddings

=item B<sources> — Tracking what files/documents have been indexed

=item B<memories> — Quick key-value thoughts stored directly

=item B<log> — Session activity journal

=back

=head1 AUTHOR

Mike Limberger / Claude

=cut