#!/usr/bin/env perl
# =============================================================================
# ragproxy-ingest — Ingest documents into RAG Proxy brains
# =============================================================================
#
# Author: Michael Limberger
# Title:  Senior Web & Linux Administrator, IITS, Faculty of Dentistry,
#         University of Toronto
#
# High-level CLI for managing brain content. Wraps Brain.pm to provide
# a friendly interface for adding, removing, and listing documents.
#
# USAGE
#   ragproxy-ingest add <brain> <path> [path ...]   Ingest files into a brain
#   ragproxy-ingest remove <brain> <pattern>        Remove docs matching pattern
#   ragproxy-ingest list [brain]                    List brains or brain contents
#   ragproxy-ingest create <name>                   Create a new empty brain
#   ragproxy-ingest rebuild <brain>                 Rebuild vectors for a brain
#   ragproxy-ingest search <brain> <query>          Test search against a brain
#   ragproxy-ingest help                            This message
#
# EXAMPLES
#   ragproxy-ingest add dental ~/docs/policies/
#   ragproxy-ingest add toad ./new-chapter.md
#   ragproxy-ingest create hr-policies
#   ragproxy-ingest list dental
#   ragproxy-ingest search dental "needlestick protocol"
#   ragproxy-ingest remove dental old-policy.md
#
# EXIT STATUS
#   0 on success, 1 on usage errors, unknown commands, missing brains, or
#   when `add` was given a path that does not exist.
#
# =============================================================================

use strict;
use warnings;

use FindBin;
use File::Basename qw(basename dirname);
use Cwd qw(abs_path);

use lib "$FindBin::RealBin/../lib";
use lib "$ENV{HOME}/perl5/lib/perl5";

# =============================================================================
# CONFIGURATION
# =============================================================================

# Project root: $RAGPROXY_HOME if set, otherwise the parent of the directory
# this script really lives in (symlinks resolved by FindBin).
my $RAGPROXY_HOME = $ENV{RAGPROXY_HOME} // dirname($FindBin::RealBin);
my $DATA_DIR      = "$RAGPROXY_HOME/data";
my $CORPUS_DIR    = "$RAGPROXY_HOME/corpus";

# =============================================================================
# COLOUR OUTPUT
# =============================================================================

my $GREEN  = "\033[32m";
my $RED    = "\033[31m";
my $YELLOW = "\033[33m";
my $CYAN   = "\033[36m";
my $DIM    = "\033[90m";
my $BOLD   = "\033[1m";
my $RESET  = "\033[0m";

# Drop all escape sequences when STDOUT is not a terminal (pipes, files).
if (! -t STDOUT) {
    $GREEN = $RED = $YELLOW = $CYAN = $DIM = $BOLD = $RESET = '';
}

# Message helpers. All of them write to STDOUT (including err), one line each.
sub info  { print "${GREEN}[OK]${RESET} $_[0]\n"; }
sub warn_ { print "${YELLOW}[WARN]${RESET} $_[0]\n"; }
sub err   { print "${RED}[ERR]${RESET} $_[0]\n"; }
sub head  { print "\n${BOLD}${CYAN}$_[0]${RESET}\n\n"; }
sub dim   { print "${DIM}$_[0]${RESET}\n"; }

# =============================================================================
# BRAIN HELPERS
# =============================================================================

# Resolve a brain name to its directory path under $DATA_DIR.
# A trailing "-brain" on the name is accepted and normalised away, so
# brain_dir('dental') and brain_dir('dental-brain') return the same path.
sub brain_dir {
    my ($name) = @_;
    $name =~ s/-brain$//;    # normalize
    return "$DATA_DIR/${name}-brain";
}

# True when the brain's directory exists and contains a brain.db file.
sub brain_exists {
    my ($name) = @_;
    my $dir = brain_dir($name);
    return (-d $dir && -f "$dir/brain.db");
}

# Return the sorted list of brain names: every sub-directory of $DATA_DIR
# whose name ends in "-brain", with that suffix removed. Returns an empty
# list when $DATA_DIR is missing or unreadable.
sub list_brains {
    return () unless -d $DATA_DIR;
    opendir my $dh, $DATA_DIR or return ();

    my @brains;
    for my $entry (readdir $dh) {
        next unless $entry =~ /-brain$/ && -d "$DATA_DIR/$entry";

        # Strip the suffix on a copy rather than on the readdir element.
        (my $name = $entry) =~ s/-brain$//;
        push @brains, $name;
    }
    closedir $dh;

    @brains = sort @brains;
    return @brains;
}

# Construct a Brain object for the given directory. Brain.pm is loaded lazily
# so that commands which never touch a brain (help, bare list) do not need it.
# Endpoint and model come from OLLAMA_URL / EMBED_MODEL with built-in defaults.
sub _new_brain {
    my ($dir) = @_;
    require Brain;
    return Brain->new(
        brain_dir  => $dir,
        ollama_url => $ENV{OLLAMA_URL}  // 'http://localhost:11434',
        model      => $ENV{EMBED_MODEL} // 'bge-m3',
    );
}

# Load a Brain instance for a given brain name.
# Prints an error and exits 1 when the brain directory does not exist.
sub load_brain {
    my ($name) = @_;
    my $dir = brain_dir($name);
    unless (-d $dir) {
        err("Brain '$name' not found at $dir");
        exit 1;
    }
    return _new_brain($dir);
}

# Count the plain files directly inside $dir, ignoring dot-files (the same
# set the shell pattern "$dir/*" would match). Uses readdir instead of glob()
# because glob() splits its pattern on whitespace and therefore miscounts
# directories whose path contains spaces. Returns 0 if $dir cannot be opened.
sub _count_plain_files {
    my ($dir) = @_;
    opendir my $dh, $dir or return 0;
    my $count = grep { !/^\./ && -f "$dir/$_" } readdir $dh;
    closedir $dh;
    return $count;
}

# Wrap a string in single quotes for /bin/sh, escaping embedded single quotes.
sub _shell_quote {
    my ($text) = @_;
    $text =~ s/'/'\\''/g;
    return "'$text'";
}

# Ask the sqlite3 command-line tool how many rows the chunks table holds.
# Returns the count as a string of digits, or '?' when sqlite3 is missing,
# fails, or prints anything that is not a plain non-negative integer.
sub _chunk_count {
    my ($db_path) = @_;
    my $quoted = _shell_quote($db_path);
    my $out    = `sqlite3 $quoted 'SELECT COUNT(*) FROM chunks;' 2>/dev/null`;
    return '?' unless defined $out;
    chomp $out;
    return $out =~ /\A\d+\z/ ? $out : '?';
}

# =============================================================================
# COMMANDS
#
# Every cmd_* sub returns the process exit status (0 = success). Fatal usage
# errors and missing brains terminate immediately with exit 1.
# =============================================================================

# add <brain> <path> [path ...]
# Index each existing path into the brain via Brain->index_path and print
# per-path and overall totals. Paths that do not exist are reported and
# skipped; the command then returns 1 instead of 0.
sub cmd_add {
    my ($brain_name, @paths) = @_;

    unless ($brain_name && @paths) {
        err("Usage: ragproxy-ingest add <brain> <path> [path ...]");
        exit 1;
    }
    unless (brain_exists($brain_name)) {
        err("Brain '$brain_name' does not exist. Create it first:");
        dim(" ragproxy-ingest create $brain_name");
        exit 1;
    }

    head("Ingesting into '$brain_name' brain");
    my $brain = load_brain($brain_name);

    my $total_files  = 0;
    my $total_chunks = 0;
    my $missing      = 0;

    for my $path (@paths) {
        unless (-e $path) {
            err("Path not found: $path");
            $missing++;
            next;
        }

        # abs_path can return undef in rare cases; fall back to the raw path.
        my $abs = abs_path($path) // $path;

        if (-d $abs) {
            my $count = _count_plain_files($abs);
            dim(" Scanning directory: $abs ($count files)");
        }
        else {
            dim(" File: $abs");
        }

        # NOTE(review): index_path is assumed to return a hashref with
        # 'files' and 'chunks' keys, as the original code read them; an
        # undefined result or missing key is treated as zero.
        my $result = $brain->index_path($abs) // {};
        my $files  = $result->{files}  // 0;
        my $chunks = $result->{chunks} // 0;

        $total_files  += $files;
        $total_chunks += $chunks;
        info("Indexed $files file(s), $chunks chunk(s) from $abs");
    }

    print "\n";
    info("Total: $total_files file(s), $total_chunks chunk(s) ingested into '$brain_name'");

    # Show updated status
    my $status = $brain->status();
    dim(" Brain now has $status->{chunks} total chunks from $status->{sources} sources");
    print "\n";

    return $missing ? 1 : 0;
}

# remove <brain> <pattern>
# Ask the brain to forget every source matching the pattern and report how
# many were removed.
sub cmd_remove {
    my ($brain_name, $pattern) = @_;

    unless ($brain_name && $pattern) {
        err("Usage: ragproxy-ingest remove <brain> <pattern>");
        dim(" Pattern matches against source file paths");
        exit 1;
    }
    unless (brain_exists($brain_name)) {
        err("Brain '$brain_name' not found");
        exit 1;
    }

    head("Removing from '$brain_name' brain");
    my $brain   = load_brain($brain_name);
    my $removed = $brain->forget($pattern) // 0;

    if ($removed > 0) {
        info("Removed $removed source(s) matching '$pattern'");
        dim(" Vectors rebuilt automatically");
    }
    else {
        warn_("No sources matched '$pattern'");
    }
    print "\n";

    return 0;
}

# Print the status summary, the per-source chunk counts and (when the status
# provides it) the per-type breakdown for a single brain.
sub _show_brain {
    my ($brain_name) = @_;

    unless (brain_exists($brain_name)) {
        err("Brain '$brain_name' not found");
        exit 1;
    }

    head("Brain: $brain_name");
    my $brain  = load_brain($brain_name);
    my $status = $brain->status();

    printf " %-14s %d\n", "Sources:", $status->{sources};
    printf " %-14s %d\n", "Chunks:",  $status->{chunks};
    printf " %-14s %s\n", "DB size:", $status->{db_size};
    printf " %-14s %s\n", "Vectors:",
        $status->{vectors_built}
        ? "${GREEN}built${RESET}"
        : "${YELLOW}not built${RESET}";
    printf " %-14s %s\n", "Last indexed:", $status->{last_indexed} // 'never';

    # List sources
    if ($status->{sources} > 0) {
        print "\n ${BOLD}Sources:${RESET}\n";

        # Reaches into the Brain object's database handle directly.
        my $db      = $brain->{sql}->db;
        my $sources = $db->query(
            'SELECT source, COUNT(*) as chunks FROM chunks GROUP BY source ORDER BY source'
        )->hashes->to_array;

        for my $s (@$sources) {
            my $short = $s->{source};
            $short =~ s|.*/||;    # basename
            printf " %-40s %d chunks\n", $short, $s->{chunks};
        }
    }

    # Show types breakdown if available
    if ($status->{by_type} && @{ $status->{by_type} }) {
        print "\n ${BOLD}By type:${RESET}\n";
        for my $t (@{ $status->{by_type} }) {
            printf " %-12s %d source(s), %d chunk(s)\n",
                $t->{source_type}, $t->{sources}, $t->{chunks} // 0;
        }
    }
    print "\n";

    return;
}

# Print one line per brain found under $DATA_DIR: chunk count, database size
# and whether a system-prompt.txt is present.
sub _show_all_brains {
    head("Available Brains");
    my @brains = list_brains();

    unless (@brains) {
        warn_("No brains found in $DATA_DIR");
        dim(" Create one: ragproxy-ingest create <name>");
        print "\n";
        return;
    }

    for my $name (@brains) {
        my $dir     = brain_dir($name);
        my $db_path = "$dir/brain.db";

        unless (-f $db_path) {
            printf " ${YELLOW}%-16s${RESET} (not indexed)\n", $name;
            next;
        }

        my $count = _chunk_count($db_path);
        my $size  = sprintf("%.1f KB", (-s $db_path // 0) / 1024);
        my $has_prompt = (-f "$dir/system-prompt.txt")
            ? " ${DIM}(has system prompt)${RESET}"
            : "";

        printf " ${CYAN}%-16s${RESET} %s chunks %s%s\n",
            $name, $count, $size, $has_prompt;
    }
    print "\n";

    return;
}

# list [brain]
# With a brain name, show that brain's details; without, list every brain.
sub cmd_list {
    my ($brain_name) = @_;

    if ($brain_name) {
        _show_brain($brain_name);
    }
    else {
        _show_all_brains();
    }

    return 0;
}

# create <name>
# Create the brain directory, initialise it through Brain.pm, and create the
# matching corpus directory. The name is reduced to [a-z0-9_-]; a name that
# is empty after sanitising is rejected.
sub cmd_create {
    my ($brain_name) = @_;

    unless (defined $brain_name && length $brain_name) {
        err("Usage: ragproxy-ingest create <name>");
        exit 1;
    }

    # Sanitize name
    $brain_name =~ s/[^a-zA-Z0-9_-]//g;
    $brain_name = lc $brain_name;

    unless (length $brain_name) {
        err("Invalid brain name: use letters, digits, '_' or '-'");
        exit 1;
    }

    my $dir = brain_dir($brain_name);
    if (-d $dir && -f "$dir/brain.db") {
        warn_("Brain '$brain_name' already exists at $dir");
        exit 1;
    }

    head("Creating brain: $brain_name");

    # Create directory. A directory left behind without brain.db (e.g. by an
    # interrupted earlier run) is reused instead of failing with "File exists".
    unless (-d $dir) {
        mkdir $dir or do {
            err("Cannot create directory: $dir ($!)");
            exit 1;
        };
    }

    # Initialize with Brain.pm (per the original author's note, this is what
    # creates the SQLite schema). The object itself is not needed afterwards.
    _new_brain($dir);

    info("Created brain '$brain_name' at $dir");
    dim(" Next: ragproxy-ingest add $brain_name /path/to/documents");

    # Also create corpus directory for source documents
    my $corpus = "$CORPUS_DIR/$brain_name";
    unless (-d $corpus) {
        if (mkdir $corpus) {
            dim(" Corpus directory: $corpus");
        }
        else {
            warn_("Cannot create corpus directory: $corpus ($!)");
        }
    }
    print "\n";

    return 0;
}

# rebuild <brain>
# Rebuild the brain's vectors via Brain->rebuild.
sub cmd_rebuild {
    my ($brain_name) = @_;

    unless ($brain_name) {
        err("Usage: ragproxy-ingest rebuild <brain>");
        exit 1;
    }
    unless (brain_exists($brain_name)) {
        err("Brain '$brain_name' not found");
        exit 1;
    }

    head("Rebuilding vectors for '$brain_name'");
    my $brain = load_brain($brain_name);
    $brain->rebuild();

    info("Vectors rebuilt for '$brain_name'");
    dim(" Restart the proxy to pick up changes: ragproxyctl restart");
    print "\n";

    return 0;
}

# search <brain> <query words ...>
# Join the remaining arguments into one query, run Brain->search for the top
# five results and print score, source basename and a 200-character preview.
sub cmd_search {
    my ($brain_name, @query_words) = @_;
    my $query = join(' ', @query_words);

    # length() rather than truthiness so that a query of "0" is accepted.
    unless ($brain_name && length $query) {
        err("Usage: ragproxy-ingest search <brain> <query>");
        exit 1;
    }
    unless (brain_exists($brain_name)) {
        err("Brain '$brain_name' not found");
        exit 1;
    }

    head("Searching '$brain_name' for: $query");
    my $brain   = load_brain($brain_name);
    my $results = $brain->search($query, 5);

    if ($results && @$results) {
        for my $i (0 .. $#$results) {
            my $r      = $results->[$i];
            my $score  = sprintf("%.4f", $r->{score} // 0);
            my $source = $r->{source} // 'unknown';
            $source =~ s|.*/||;    # basename

            print " ${BOLD}" . ($i + 1) . ".${RESET} ";
            print "${CYAN}$score${RESET} ${DIM}($source)${RESET}\n";

            # Show first 200 chars of content
            my $preview = $r->{content} // '';
            $preview =~ s/\s+/ /g;
            $preview = substr($preview, 0, 200) . '...' if length($preview) > 200;
            print " $preview\n\n";
        }
    }
    else {
        warn_("No results found");
    }
    print "\n";

    return 0;
}

# help
# Print the usage summary. Always returns 0.
sub cmd_help {
    print <<"HELP";

${BOLD}ragproxy-ingest${RESET} — Document ingestion for RAG Proxy brains

${BOLD}USAGE${RESET}
  ragproxy-ingest <command> [args]

${BOLD}COMMANDS${RESET}
  ${CYAN}add${RESET} <brain> <path> [path ...]   Ingest files/directories into a brain
  ${CYAN}remove${RESET} <brain> <pattern>        Remove sources matching pattern
  ${CYAN}list${RESET} [brain]                    List all brains, or details of one
  ${CYAN}create${RESET} <name>                   Create a new empty brain
  ${CYAN}rebuild${RESET} <brain>                 Rebuild vector index
  ${CYAN}search${RESET} <brain> <query>          Test search against a brain
  ${CYAN}help${RESET}                            This message

${BOLD}EXAMPLES${RESET}
  ragproxy-ingest list                            # See all brains
  ragproxy-ingest list dental                     # Details of dental brain
  ragproxy-ingest create hr-policies              # New brain
  ragproxy-ingest add dental ~/docs/policies/     # Ingest a directory
  ragproxy-ingest add toad ./new-chapter.md       # Ingest a single file
  ragproxy-ingest search dental "needlestick"     # Test search quality
  ragproxy-ingest remove dental old-policy.md     # Remove a document

${BOLD}SUPPORTED FILE TYPES${RESET}
  .md .txt .html .pl .pm .py .sh .conf .json .yaml .yml .xml .csv

${BOLD}ENVIRONMENT${RESET}
  RAGPROXY_HOME   Project root (default: auto-detect)
  OLLAMA_URL      Ollama server (default: http://localhost:11434)
  EMBED_MODEL     Embedding model (default: bge-m3)

HELP
    return 0;
}

# =============================================================================
# MAIN DISPATCH
# =============================================================================

# Dispatch the first argument (case-insensitive, default "help") to its
# command handler, passing the remaining arguments. Returns the exit status:
# the handler's return value (0 if undefined), or 1 for an unknown command.
sub main {
    my @args    = @_;
    my $command = lc(shift(@args) // 'help');

    my %handler_for = (
        add      => \&cmd_add,
        remove   => \&cmd_remove,
        forget   => \&cmd_remove,     # alias
        list     => \&cmd_list,
        ls       => \&cmd_list,       # alias
        create   => \&cmd_create,
        new      => \&cmd_create,     # alias
        rebuild  => \&cmd_rebuild,
        search   => \&cmd_search,
        help     => \&cmd_help,
        '--help' => \&cmd_help,
        '-h'     => \&cmd_help,
    );

    my $handler = $handler_for{$command};
    unless ($handler) {
        err("Unknown command: $command");
        print "Run 'ragproxy-ingest help' for usage.\n";
        return 1;
    }

    return $handler->(@args) // 0;
}

# Run only when executed as a script; when the file is loaded with
# require/do (e.g. from a test) the subs are defined but nothing is run.
exit(main(@ARGV)) unless caller;

__END__