#!/usr/bin/env perl
# =============================================================================
# ragproxy — Universal RAG Proxy
# =============================================================================
#
# Author: Michael Limberger
# Title:  Senior Web & Linux Administrator, IITS, Faculty of Dentistry,
#         University of Toronto
#
# Sits between AI clients (OpenWebUI, curl, Aider) and Ollama.
# Intercepts chat requests, enriches them with context from Brain.pm
# vector search, and forwards the augmented request to Ollama.
# The client never knows RAG is happening — it just gets better answers.
#
# PROTOCOLS
#   OpenAI-compatible: /v1/models, /v1/chat/completions (SSE streaming)
#   Ollama-native:     /api/tags, /api/chat, /api/generate (NDJSON streaming)
#
# MODEL NAME CONVENTION
#   "dental/ministral-3"  → search dental brain, forward to ministral-3
#   "perl-ops/qwen3:8b"   → search perl + ops brains, forward to qwen3:8b
#   "ministral-3"         → no RAG, pure passthrough
#
# BRAIN CONFIGURATION
#   BRAIN_<NAME>=<dir> — register a brain (e.g. BRAIN_DENTAL=/path/to/brain)
#   Default: BRAIN_DENTAL=scripts/../data/dental-brain (relative to this script)
#
# OTHER ENV VARS
#   OLLAMA_URL       — Ollama backend URL (default: http://localhost:11434)
#   RAGPROXY_PORT    — listen port (default: 7079)
#   RAGPROXY_API_KEY — optional Bearer token auth (disabled when unset)
#   RAG_SEARCH_LIMIT — chunks to retrieve per brain (default: 5)
#   LLM_TIMEOUT      — seconds for LLM requests (default: 120)
#   VISION_TIMEOUT   — seconds for vision requests (default: 300)
#
# LAUNCH
#   perl ragproxy daemon
#   RAGPROXY_PORT=7070 perl ragproxy daemon
#   perl ragproxy routes          # show route table
#   perl ragproxy get /v1/models  # quick smoke test
#
# =============================================================================

use strict;
use warnings;
use FindBin;

# Locate Brain.pm relative to this script:
#   scripts/ -> tkf-2026/ -> projects/ -> workspace root -> scripts/brain/lib/
use lib "$FindBin::Bin/../lib";
use lib "$ENV{HOME}/perl5/lib/perl5";    # CPAN modules (Mojo::SQLite, PDL, etc.)

use Mojolicious::Lite -signatures;
use Mojo::JSON qw(encode_json decode_json);
use Mojo::Promise;
use File::Basename qw(basename);
use Time::HiRes;

# Pre-load Brain.pm and its heavy deps (PDL, Mojo::SQLite) in the parent
# so forked subprocess children inherit them already loaded.
use Brain;

# =============================================================================
# SECTION 1 — Configuration
# =============================================================================

# All knobs come from the environment with sane local-dev defaults.
my $OLLAMA_URL       = $ENV{OLLAMA_URL} // 'http://localhost:11434';
my $RAGPROXY_PORT    = $ENV{RAGPROXY_PORT} // 7079;
my $RAGPROXY_API_KEY = $ENV{RAGPROXY_API_KEY};    # undef = auth disabled
my $RAG_LIMIT        = int($ENV{RAG_SEARCH_LIMIT} // 5);
my $LLM_TIMEOUT      = int($ENV{LLM_TIMEOUT} // 120);
my $VISION_TIMEOUT   = int($ENV{VISION_TIMEOUT} // 300);

# Allowed models: if models.conf exists, only these models are exposed.
# If the file is missing or empty, all Ollama models are shown.
my %ALLOWED_MODELS = _load_allowed_models();

# Brain registry: name -> directory path
# Populated from BRAIN_<NAME> env vars + defaults relative to this script.
my %BRAINS = _load_brain_registry();

# Per-brain system prompts: name -> prompt text
# Loaded from system-prompt.txt in each brain directory (must run after %BRAINS).
my %BRAIN_PROMPTS = _load_brain_prompts();

# ---------------------------------------------------------------------------
# _load_allowed_models()
#
# Reads conf/models.conf (one model name per line, '#' comments allowed)
# into an allow-list hash. Returns an empty hash — meaning "no filtering" —
# when the file is absent or unreadable.
# ---------------------------------------------------------------------------
sub _load_allowed_models {
    my %wl;
    my $conf = "$FindBin::Bin/../conf/models.conf";
    return %wl unless -f $conf;    # No config file = no filtering (show all)

    open my $fh, '<', $conf or do {
        warn "Cannot read $conf: $!\n";
        return %wl;
    };
    while (<$fh>) {
        chomp;
        s/^\s+//;
        s/\s+$//;                  # trim whitespace
        next if /^#/ || /^$/;      # skip comments and blank lines
        $wl{$_} = 1;
    }
    close $fh;
    return %wl;
}

# ---------------------------------------------------------------------------
# _load_brain_prompts()
#
# For each registered brain, slurps <brain_dir>/system-prompt.txt if present.
# Missing files are silently skipped; unreadable files emit a warning.
# ---------------------------------------------------------------------------
sub _load_brain_prompts {
    my %prompts;
    for my $name (keys %BRAINS) {
        my $file = "$BRAINS{$name}/system-prompt.txt";
        next unless -f $file;
        if (open my $fh, '<', $file) {
            local $/;              # slurp mode
            my $text = <$fh>;
            close $fh;
            chomp $text;
            $prompts{$name} = $text;
        }
        else {
            warn "Cannot read $file: $!\n";
        }
    }
    return %prompts;
}

# ---------------------------------------------------------------------------
# _load_brain_registry()
#
# Builds the brain name -> directory map. Env vars of the form
# BRAIN_<NAME>=<dir> win; a fixed set of default brains is filled in for any
# name not overridden. Directories are not validated here — readiness is
# checked at search time (_search_brain_async).
# ---------------------------------------------------------------------------
sub _load_brain_registry {
    my %r;

    # Discover configured brains from env vars (BRAIN_DENTAL, BRAIN_PERL, etc.)
    for my $key (keys %ENV) {
        if ($key =~ /^BRAIN_([A-Z][A-Z0-9_]*)$/) {
            my $name = lc($1);
            $r{$name} = $ENV{$key};
        }
    }

    # Default brains — relative to this script file
    unless ($r{dental})      { my $default = "$FindBin::Bin/../data/dental-brain";      $r{dental}      = $default; }
    unless ($r{toad})        { my $default = "$FindBin::Bin/../data/toad-brain";        $r{toad}        = $default; }
    unless ($r{perl})        { my $default = "$FindBin::Bin/../data/perl-brain";        $r{perl}        = $default; }
    unless ($r{ruby})        { my $default = "$FindBin::Bin/../data/ruby-brain";        $r{ruby}        = $default; }
    unless ($r{survival})    { my $default = "$FindBin::Bin/../data/survival-brain";    $r{survival}    = $default; }
    unless ($r{usw1998})     { my $default = "$FindBin::Bin/../data/usw1998-brain";     $r{usw1998}     = $default; }
    unless ($r{rcmd})        { my $default = "$FindBin::Bin/../data/rcmd-brain";        $r{rcmd}        = $default; }
    unless ($r{kb_external}) { my $default = "$FindBin::Bin/../data/kb-external-brain"; $r{kb_external} = $default; }
    unless ($r{kb_internal}) { my $default = "$FindBin::Bin/../data/kb-internal-brain"; $r{kb_internal} = $default; }

    return %r;
}

# =============================================================================
# SECTION 2 — Application Setup
# =============================================================================

# Large request size limit for vision (images can be 2-10 MB base64-encoded)
app->max_request_size(100 * 1024 * 1024);

# Configure the app-level UA for upstream Ollama requests
app->ua->connect_timeout(10);
app->ua->inactivity_timeout($LLM_TIMEOUT);
app->ua->request_timeout(0);     # No hard timeout — inactivity handles it
app->ua->max_response_size(0);   # Don't cap response size

app->log->info('ragproxy starting');
app->log->info("Ollama URL : $OLLAMA_URL");
app->log->info("Port : $RAGPROXY_PORT");
app->log->info("RAG limit : $RAG_LIMIT chunks per brain");
app->log->info("Auth : " . (defined $RAGPROXY_API_KEY ? 'enabled' : 'disabled'));

if (%ALLOWED_MODELS) {
    app->log->info("Model filter: " . join(', ', sort keys %ALLOWED_MODELS));
}
else {
    app->log->info("Model filter: disabled (showing all Ollama models)");
}

if (%BRAINS) {
    for my $name (sort keys %BRAINS) {
        my $dir   = $BRAINS{$name};
        my $ready = (-d $dir && -f "$dir/brain.db") ? 'READY' : 'NOT INDEXED';
        app->log->info("Brain [$name]: $dir ($ready)");
    }
}
else {
    app->log->warn('No brains configured — RAG disabled');
}

# =============================================================================
# SECTION 3 — Authentication Hook (optional Bearer token)
# =============================================================================
#
# When RAGPROXY_API_KEY is set, every request must include:
#   Authorization: Bearer <key>
# When unset, auth is disabled (suitable for local dev/demo).
#
# Rendering inside before_dispatch short-circuits routing, so unauthorized
# requests never reach a route handler.
#
# =============================================================================

app->hook(before_dispatch => sub ($c) {
    return unless defined $RAGPROXY_API_KEY;

    my $auth = $c->req->headers->authorization // '';
    if ($auth =~ /^Bearer\s+(.+)$/i && $1 eq $RAGPROXY_API_KEY) {
        return;    # Authorized
    }

    $c->render(
        status => 401,
        json   => {
            error => {
                message => 'Unauthorized',
                type    => 'auth_error',
                code    => 401,
                param   => undef,
            }
        }
    );
});

# =============================================================================
# SECTION 4 — GET /v1/models (OpenAI model listing with RAG variants)
# =============================================================================
#
# Fetches the real model list from Ollama, then generates additional entries
# for each configured brain, giving clients model names like:
#   ministral-3:14b          (raw passthrough)
#   dental/ministral-3:14b   (RAG-enhanced with dental brain)
#
# =============================================================================

get '/v1/models' => sub ($c) {
    $c->ua->get_p("$OLLAMA_URL/v1/models")->then(sub ($tx) {
        unless ($tx->res->is_success) {
            my $err = $tx->error;
            app->log->error("Failed to fetch /v1/models: " . ($err->{message} // 'unknown'));
            return _openai_error($c, 502, 'Failed to fetch model list from backend');
        }
        my $data     = $tx->res->json;
        my $models   = $data->{data} // [];
        my $expanded = _expand_openai_models($models);
        $c->render(json => { object => 'list', data => $expanded });
    })->catch(sub ($err) {
        app->log->error("Models endpoint error: $err");
        _openai_error($c, 502, 'Backend service unavailable');
    });
};

# =============================================================================
# SECTION 5 — POST /v1/chat/completions (OpenAI chat with RAG injection)
# =============================================================================
#
# Flow:
#   1. Parse model name → extract brain names and real Ollama model
#   2. Detect vision content (images) → set extended timeout
#   3. If brains requested: extract text query, search each brain in subprocess
#   4. Inject retrieved chunks as system message context
#   5. Forward augmented request to Ollama (streaming SSE or non-streaming)
#
# =============================================================================

post '/v1/chat/completions' => sub ($c) {
    my $body = $c->req->json;

    # -----------------------------------------------------------------------
    # Input validation
    # -----------------------------------------------------------------------
    unless ($body && ref $body eq 'HASH') {
        return _openai_error($c, 400, 'Request body must be JSON', 'invalid_request_error');
    }
    my $model = $body->{model};
    unless (defined $model && length $model) {
        return _openai_error($c, 400, 'Missing required field: model', 'invalid_request_error', 'model');
    }
    my $messages = $body->{messages};
    unless ($messages && ref $messages eq 'ARRAY' && @$messages) {
        return _openai_error($c, 400, 'Missing required field: messages', 'invalid_request_error', 'messages');
    }

    # -----------------------------------------------------------------------
    # Parse model name and set up routing
    # -----------------------------------------------------------------------
    # BUGFIX: the default was previously `// \0` — a reference to the constant
    # 0, which is TRUE as a Perl boolean (and stringifies to "SCALAR(0x…)",
    # passing the ne '0' / ne 'false' checks below). That made omitted
    # "stream" default to streaming, the opposite of OpenAI semantics.
    # A plain 0 restores the documented non-streaming default; JSON true/false
    # booleans from Mojo::JSON stringify to "1"/"0" and still work.
    my $streaming    = $body->{stream} // 0;    # OpenAI default: non-streaming
    my $is_streaming = $streaming && $streaming ne '0' && $streaming ne 'false';

    my ($brain_names, $base_model) = _parse_model_name($model);
    $body->{model}      = $base_model;
    $body->{keep_alive} = -1;    # Keep models loaded permanently

    my $has_vision = _has_vision_content($messages);
    my $timeout    = $has_vision ? $VISION_TIMEOUT : $LLM_TIMEOUT;

    app->log->info(sprintf(
        'OAI chat: model=%s base=%s brains=[%s] stream=%s vision=%s',
        $model, $base_model, join(',', @$brain_names) || 'none',
        $is_streaming ? 'yes' : 'no',
        $has_vision   ? 'yes' : 'no',
    ));

    # -----------------------------------------------------------------------
    # RAG retrieval or direct passthrough
    # -----------------------------------------------------------------------
    $c->render_later;    # We will render asynchronously

    if (@$brain_names) {
        my $query = _extract_query_text($messages);
        unless (defined $query && length $query) {
            app->log->warn('No text query for RAG — forwarding without context');
            _forward_openai($c, $body, $is_streaming, $timeout);
            return;
        }

        app->log->debug("RAG query: $query");
        _rag_search_all($brain_names, $query, sub ($chunks) {
            if ($chunks && @$chunks) {
                _inject_rag_context($messages, $chunks, $brain_names);
                app->log->info("Injected " . scalar(@$chunks) . " RAG chunks into context");
            }
            else {
                app->log->warn('RAG returned no chunks — forwarding without context');
            }
            _forward_openai($c, $body, $is_streaming, $timeout);
        });
    }
    else {
        # No RAG prefix — pure passthrough
        _forward_openai($c, $body, $is_streaming, $timeout);
    }
};

# =============================================================================
# SECTION 6 — GET /api/tags (Ollama native model listing with RAG variants)
# =============================================================================

get '/api/tags' => sub ($c) {
    $c->ua->get_p("$OLLAMA_URL/api/tags")->then(sub ($tx) {
        unless ($tx->res->is_success) {
            my $err = $tx->error;
            app->log->error("Failed to fetch /api/tags: " . ($err->{message} // 'unknown'));
            return $c->render(status => 502, json => { error => 'Failed to fetch models from backend' });
        }
        my $data     = $tx->res->json;
        my $models   = $data->{models} // [];
        my $expanded = _expand_ollama_models($models);
        $c->render(json => { models => $expanded });
    })->catch(sub ($err) {
        app->log->error("Tags endpoint error: $err");
        $c->render(status => 502, json => { error => 'Backend service unavailable' });
    });
};

# =============================================================================
# SECTION 7 — POST /api/chat (Ollama native chat with RAG injection)
# =============================================================================
#
# Same RAG logic as /v1/chat/completions but:
#   - Accepts Ollama's native request format (options, keep_alive, etc.)
#   - Streaming via NDJSON (default: true, opposite of OpenAI)
#   - Errors use Ollama's native format: {"error": "message"}
#
# =============================================================================

post '/api/chat' => sub ($c) {
    my $body = $c->req->json;

    # Input validation — same checks as the OpenAI route, Ollama error format.
    unless ($body && ref $body eq 'HASH') {
        return $c->render(status => 400, json => { error => 'Request body must be JSON' });
    }
    my $model = $body->{model};
    unless (defined $model && length $model) {
        return $c->render(status => 400, json => { error => 'Missing required field: model' });
    }
    my $messages = $body->{messages};
    unless ($messages && ref $messages eq 'ARRAY' && @$messages) {
        return $c->render(status => 400, json => { error => 'Missing required field: messages' });
    }

    # Ollama defaults to streaming = true (opposite of OpenAI)
    my $is_streaming = exists $body->{stream} ? $body->{stream} : 1;

    my ($brain_names, $base_model) = _parse_model_name($model);
    $body->{model} = $base_model;
    $body->{keep_alive} = -1;    # Keep models loaded permanently

    my $has_vision = _has_vision_content($messages);
    my $timeout = $has_vision ? $VISION_TIMEOUT : $LLM_TIMEOUT;

    app->log->info(sprintf(
        'Ollama chat: model=%s base=%s brains=[%s] stream=%s vision=%s',
        $model, $base_model, join(',', @$brain_names) || 'none',
        $is_streaming ? 'yes' : 'no',
        $has_vision ? 'yes' : 'no',
    ));

    $c->render_later;

    # Request-size logging (character counts only include string content;
    # multimodal array content contributes 0 here).
    my $msg_count = scalar @$messages;
    my $total_chars = 0;
    $total_chars += length($_->{content} // '') for @$messages;
    app->log->info("Request: $msg_count messages, $total_chars total chars, timeout=${timeout}s");

    # Detect OpenWebUI title generation requests — skip RAG (they don't need it)
    # NOTE(review): length() on array-ref content would warn here if the last
    # message is multimodal; confirm title requests are always plain strings.
    my $last_content = $messages->[-1]{content} // '';
    if ($last_content =~ /^###\s*Task:|^Create a concise|^Generate a.*title/i) {
        app->log->info("Title generation detected — skipping RAG, passthrough");
        _forward_ollama($c, $body, $is_streaming, $timeout);
        return;
    }

    if (@$brain_names) {
        my $query = _extract_query_text($messages);
        unless (defined $query && length $query) {
            app->log->warn('No text query for RAG (native) — forwarding without context');
            _forward_ollama($c, $body, $is_streaming, $timeout);
            return;
        }

        app->log->info("RAG search starting for query: " . substr($query, 0, 80) . "...");
        my $rag_start = Time::HiRes::time();

        _rag_search_all($brain_names, $query, sub ($chunks) {
            my $rag_elapsed = sprintf("%.2f", Time::HiRes::time() - $rag_start);
            app->log->info("RAG search done in ${rag_elapsed}s — " . scalar(@$chunks) . " chunks found");

            if ($chunks && @$chunks) {
                _inject_rag_context($messages, $chunks, $brain_names);
                my $ctx_chars = 0;
                $ctx_chars += length($_->{content} // '') for @$messages;
                app->log->info("After injection: $ctx_chars total chars in messages");
            }

            app->log->info("Forwarding to Ollama (stream=$is_streaming, timeout=${timeout}s)");
            _forward_ollama($c, $body, $is_streaming, $timeout);
        });
    }
    else {
        app->log->info("No brains — passthrough to Ollama");
        _forward_ollama($c, $body, $is_streaming, $timeout);
    }
};

# =============================================================================
# SECTION 8 — Catch-all passthrough
# =============================================================================
#
# Forwards safe Ollama API requests not handled by explicit routes above.
# SSRF prevention: allowlist of safe path patterns.
# Destructive methods blocked.
#
# =============================================================================

my @PASSTHROUGH_ALLOWLIST = (
    qr{^/api/show$},       # Model info
    qr{^/api/ps$},         # Running models
    qr{^/api/version$},    # Version check
    qr{^/api/generate$},   # Single-turn generate (vision passthrough)
    qr{^/$},               # Root health check
);

# Hop-by-hop headers (RFC 2616 §13.5.1) that must not be forwarded upstream.
my @HOP_BY_HOP = qw(
    host connection transfer-encoding te trailer upgrade
    proxy-authorization proxy-connection keep-alive
);

any '/*proxy_path' => { proxy_path => '' } => sub ($c) {
    my $method = $c->req->method;
    my $path   = $c->req->url->path->to_string;

    # Check SSRF allowlist (grep in scalar context = match count)
    my $allowed = grep { $path =~ $_ } @PASSTHROUGH_ALLOWLIST;
    unless ($allowed) {
        app->log->warn("Blocked passthrough to restricted path: $method $path");
        return _openai_error($c, 403, 'Endpoint not available through proxy');
    }

    # Block destructive methods
    unless ($method =~ /^(GET|HEAD|POST|OPTIONS)$/) {
        app->log->warn("Blocked destructive method on passthrough: $method $path");
        return _openai_error($c, 405, 'Method not allowed');
    }

    app->log->debug("Passthrough: $method $path");

    # Build upstream request with sanitized headers
    my $url = $OLLAMA_URL . $c->req->url->path_query;
    my $clean_headers = _strip_hop_by_hop($c->req->headers);

    my $tx;
    if (my $body = $c->req->body) {
        $tx = $c->ua->build_tx($method => $url => $clean_headers => $body);
    }
    else {
        $tx = $c->ua->build_tx($method => $url => $clean_headers);
    }

    $c->proxy->start_p($tx)->catch(sub ($err) {
        app->log->error("Passthrough error ($method $path): $err");
        _openai_error($c, 502, 'Backend service unavailable');
    });
};

# =============================================================================
# SECTION 9 — Forwarding to Ollama
# =============================================================================

# ---------------------------------------------------------------------------
# _forward_openai($c, $body, $is_streaming, $timeout)
#
# Forward to Ollama's /v1/chat/completions endpoint.
#   Streaming:     SSE format "data: {json}\n\n" ... "data: [DONE]\n\n"
#   Non-streaming: uses Mojo's built-in proxy helper.
# ---------------------------------------------------------------------------
sub _forward_openai {
    my ($c, $body, $is_streaming, $timeout) = @_;
    $timeout //= $LLM_TIMEOUT;

    my $url = "$OLLAMA_URL/v1/chat/completions";

    if ($is_streaming) {
        my $tx = $c->ua->build_tx(
            POST => $url => { 'Content-Type' => 'application/json' } => json => $body
        );

        # Set per-connection inactivity timeout (vision needs 300s)
        $c->inactivity_timeout($timeout);

        # Send SSE headers immediately — starts the response
        $c->res->headers->content_type('text/event-stream');
        $c->res->headers->header('Cache-Control' => 'no-cache');
        $c->res->headers->header('X-Accel-Buffering' => 'no');
        $c->write;    # Finalize headers, begin streaming body

        # CRITICAL: Unsubscribe the default accumulating read handler.
        # Attach our own handler that forwards each chunk immediately.
        # This is the zero-buffering pattern from the reference proxy.
        # eval{} guards against "Transaction already destroyed" when client
        # disconnects.
        $tx->res->content->unsubscribe('read')->on(read => sub {
            my ($content, $bytes) = @_;
            eval { $c->write($bytes) };
        });

        # Mid-stream upstream failure: emit an SSE error event, then close.
        $tx->res->on(error => sub {
            my ($res, $err) = @_;
            app->log->error("SSE upstream error: $err");
            eval {
                my $edata = encode_json({ error => { message => 'Backend streaming error', type => 'server_error' } });
                $c->write("data: $edata\n\ndata: [DONE]\n\n");
            };
            $c->finish;
        });

        # Connection-level failure (no HTTP status code) — report and close.
        $c->ua->start($tx => sub {
            my ($ua, $tx) = @_;
            if (my $err = $tx->error) {
                unless ($err->{code}) {
                    app->log->error("Ollama connection failed (OAI stream): $err->{message}");
                    eval {
                        my $edata = encode_json({ error => { message => 'Backend unavailable', type => 'server_error' } });
                        $c->write("data: $edata\n\ndata: [DONE]\n\n");
                    };
                }
            }
            $c->finish;
        });

        $c->on(finish => sub { app->log->debug('SSE client disconnected') });
    }
    else {
        # Non-streaming: must set inactivity timeout (default 30s is too short)
        $c->inactivity_timeout($timeout);
        my $tx = $c->ua->build_tx(
            POST => $url => { 'Content-Type' => 'application/json' } => json => $body
        );
        $c->proxy->start_p($tx)->catch(sub ($err) {
            app->log->error("Ollama non-streaming error (OAI): $err");
            _openai_error($c, 502, 'Backend service unavailable');
        });
    }
}

# ---------------------------------------------------------------------------
# _forward_ollama($c, $body, $is_streaming, $timeout)
#
# Forward to Ollama's /api/chat endpoint (native protocol).
#   Streaming: NDJSON — one JSON object per line, ends with "done":true chunk.
# ---------------------------------------------------------------------------
sub _forward_ollama {
    my ($c, $body, $is_streaming, $timeout) = @_;
    $timeout //= $LLM_TIMEOUT;

    my $url = "$OLLAMA_URL/api/chat";

    if ($is_streaming) {
        my $tx = $c->ua->build_tx(
            POST => $url => { 'Content-Type' => 'application/json' } => json => $body
        );

        $c->inactivity_timeout($timeout);

        # NDJSON content type
        $c->res->headers->content_type('application/x-ndjson');
        $c->write;

        # Zero-buffering: pipe each NDJSON chunk through immediately
        # eval{} guards against "Transaction already destroyed" when client
        # disconnects.
        $tx->res->content->unsubscribe('read')->on(read => sub {
            my ($content, $bytes) = @_;
            eval { $c->write($bytes) };
        });

        # Mid-stream upstream failure: emit one NDJSON error line, then close.
        $tx->res->on(error => sub {
            my ($res, $err) = @_;
            app->log->error("NDJSON upstream error: $err");
            eval { $c->write(encode_json({ error => 'Backend streaming error' }) . "\n") };
            $c->finish;
        });

        # Connection-level failure (no HTTP status code) — report and close.
        $c->ua->start($tx => sub {
            my ($ua, $tx) = @_;
            if (my $err = $tx->error) {
                unless ($err->{code}) {
                    app->log->error("Ollama connection failed (native stream): $err->{message}");
                    eval { $c->write(encode_json({ error => 'Backend unavailable' }) . "\n") };
                }
            }
            $c->finish;
        });

        $c->on(finish => sub { app->log->debug('NDJSON client disconnected') });
    }
    else {
        # Non-streaming: must set inactivity timeout (default 30s is too short)
        $c->inactivity_timeout($timeout);
        my $tx = $c->ua->build_tx(
            POST => $url => { 'Content-Type' => 'application/json' } => json => $body
        );
        $c->proxy->start_p($tx)->catch(sub ($err) {
            app->log->error("Ollama non-streaming error (native): $err");
            $c->render(status => 502, json => { error => 'Backend service unavailable' });
        });
    }
}

# =============================================================================
# SECTION 10 — RAG Functions
# =============================================================================

# ---------------------------------------------------------------------------
# _rag_search_all(\@brain_names, $query, $callback)
#
# Searches all named brains asynchronously (one subprocess per brain).
# Calls $callback->(\@merged_chunks) when all searches complete.
# Merges results from all brains, sorts by score, limits to $RAG_LIMIT.
# Falls back to empty results if all searches fail (graceful degradation —
# each failed brain contributes an empty list, so the counter still reaches
# $remaining and the callback always fires exactly once).
# ---------------------------------------------------------------------------
sub _rag_search_all {
    my ($brain_names, $query, $callback) = @_;

    # Filter to only brains with configured directories
    my @valid = grep { exists $BRAINS{$_} } @$brain_names;
    unless (@valid) {
        app->log->warn("None of the requested brains exist: " . join(', ', @$brain_names));
        $callback->([]);
        return;
    }

    my @all_chunks;
    my $remaining = scalar @valid;
    my $completed = 0;

    for my $name (@valid) {
        _search_brain_async($name, $query, sub ($chunks) {
            push @all_chunks, @{$chunks // []};
            $completed++;
            if ($completed >= $remaining) {
                # All brains done — sort by score (descending) and limit
                my @sorted = sort { ($b->{score} // 0) <=> ($a->{score} // 0) } @all_chunks;
                if (@sorted > $RAG_LIMIT) {
                    @sorted = @sorted[0 .. ($RAG_LIMIT - 1)];
                }
                $callback->(\@sorted);
            }
        });
    }
}

# ---------------------------------------------------------------------------
# _search_brain_async($name, $query, $callback)
#
# Runs a Brain.pm search in a forked subprocess (non-blocking).
# The subprocess isolates Brain.pm's synchronous Ollama embedding call
# from the Mojolicious event loop.
# Calls $callback->(\@chunks) when done. Empty arrayref on failure.
#
# SUBPROCESS SAFETY
#   Mojo::IOLoop::Subprocess calls Mojo::IOLoop->reset in the child,
#   clearing the inherited event loop state. This makes synchronous
#   Mojo::UserAgent calls safe in the child process.
# ---------------------------------------------------------------------------
sub _search_brain_async {
    my ($name, $query, $callback) = @_;

    my $brain_dir  = $BRAINS{$name};
    my $ollama_url = $OLLAMA_URL;
    my $limit      = $RAG_LIMIT;

    # Check if brain DB exists before forking
    unless (-d $brain_dir && -f "$brain_dir/brain.db") {
        app->log->warn("Brain '$name' not ready at $brain_dir — skipping");
        $callback->([]);
        return;
    }

    # Capture @INC for restoration in child (fork inherits it, but explicit is safer)
    my @inc_paths = @INC;

    Mojo::IOLoop->subprocess->run_p(sub {
        # ---- CHILD PROCESS ----
        # Brain.pm is already loaded (use Brain; above), %INC shows it.
        # Create a fresh instance — new UA, new SQLite connection.
        @INC = @inc_paths;
        my $brain = Brain->new(
            brain_dir  => $brain_dir,
            ollama_url => $ollama_url,
            model      => 'bge-m3',
        );
        my $results = eval { $brain->search($query, $limit) };
        if ($@) {
            # Can't warn to parent's logger — just return empty
            return [];
        }
        return $results // [];
    })->then(sub ($results) {
        $callback->($results // []);
    })->catch(sub ($err) {
        app->log->error("Brain '$name' subprocess failed: $err");
        $callback->([]);
    });
}

# ---------------------------------------------------------------------------
# _inject_rag_context(\@messages, \@chunks, \@brain_names)
#
# Injects retrieved chunks into the messages array as a system message.
# Prepends a new system message if none exists; appends to existing one.
# Each chunk is formatted with its source filename and section heading.
#
# SECURITY: strips prompt injection attempts from chunk content before
# injecting (removes common injection markers).
# ---------------------------------------------------------------------------
sub _inject_rag_context {
    my ($messages, $chunks, $brain_names) = @_;
    return unless $chunks && @$chunks;

    my @doc_blocks;
    for my $chunk (@$chunks) {
        my $source  = $chunk->{source}  // 'unknown';
        my $section = $chunk->{section} // '';
        my $content = $chunk->{content} // '';
        my $score   = $chunk->{score}   // '0';    # NOTE(review): $score is currently unused

        # Use basename for cleaner source display
        $source = basename($source) if $source !~ /^memory:/;

        # SECURITY: sanitize for prompt injection
        # Remove known injection markers but preserve document content
        $content =~ s{\[(?:INST|SYS)\]|\[/(?:INST|SYS)\]}{}gi;
        $content =~ s{<\|(?:im_start|im_end|system|user|assistant)\|>}{}gi;
        $content =~ s{###\s*(?:System|Assistant|Human|User)\s*:}{[redacted]:}gi;

        my $header = "--- [Source: $source" . ($section ? " / $section" : '') . "] ---";
        push @doc_blocks, "$header\n$content";
    }

    my $context_block = join("\n\n", @doc_blocks);

    # Build system prompt from per-brain system-prompt.txt files.
    # If multiple brains are queried, combine their prompts.
    # Falls back to a generic prompt if no system-prompt.txt exists.
    my $identity = '';
    if ($brain_names && @$brain_names) {
        my @prompts;
        for my $name (@$brain_names) {
            if (exists $BRAIN_PROMPTS{$name}) {
                push @prompts, $BRAIN_PROMPTS{$name};
            }
        }
        $identity = join("\n\n", @prompts) if @prompts;
    }

    # Fallback if no per-brain prompt found
    unless ($identity) {
        $identity = "You are a knowledgeable assistant. Answer questions using ONLY the retrieved documents below. If the answer is not in the documents, say so.";
    }

    my $rag_system = "$identity\n\nRETRIEVED DOCUMENTS:\n$context_block";

    # Inject: append to existing system message or prepend new one
    # NOTE(review): assumes the first message's {role} is defined — an
    # undef role would warn here; upstream validation only checks non-empty.
    if (@$messages && $messages->[0]{role} eq 'system') {
        $messages->[0]{content} .= "\n\n$rag_system";
    }
    else {
        unshift @$messages, {
            role    => 'system',
            content => $rag_system,
        };
    }
}

# =============================================================================
# SECTION 11 — Helper Functions
# =============================================================================

# ---------------------------------------------------------------------------
# _parse_model_name($model_string)
#
# Splits "dental/ministral-3"  into (["dental"], "ministral-3").
# Splits "perl-ops/qwen3:8b"   into (["perl", "ops"], "qwen3:8b").
# Each hyphen-separated part of the prefix is a brain name.
# Unknown brain names are silently dropped (graceful degradation).
#
# Returns (\@brain_names, $base_model_string)
# ---------------------------------------------------------------------------
sub _parse_model_name {
    my ($model) = @_;

    # Split on LAST '/' — everything before is brain prefix, after is model
    if ($model =~ m{^(.+)/([^/]+)$}) {
        my ($prefix, $base) = ($1, $2);

        # Split prefix on '-' to get individual brain names
        my @parts  = split /-/, $prefix;
        my @brains = grep { exists $BRAINS{$_} } @parts;

        # Warn if some parts were unknown
        my @unknown = grep { !exists $BRAINS{$_} } @parts;
        if (@unknown) {
            app->log->warn("Unknown brain name(s) in model prefix: " . join(', ', @unknown));
        }

        return (\@brains, $base);
    }

    # No slash — plain model name, no RAG
    return ([], $model);
}

# ---------------------------------------------------------------------------
# _has_vision_content(\@messages)
#
# Returns 1 if any message contains image data.
#
#   Ollama format: messages[n].images  = [ "base64...", ... ]
#   OpenAI format: messages[n].content = [ { type: "image_url", ... }, ...
# ]
# ---------------------------------------------------------------------------
sub _has_vision_content {
    my ($messages) = @_;

    for my $entry (@$messages) {
        # Ollama native: a non-empty images array on the message itself.
        my $imgs = $entry->{images};
        return 1 if ref $imgs eq 'ARRAY' && @$imgs;

        # OpenAI multimodal: content is a list of parts; any image_url part
        # marks the request as vision.
        my $content = $entry->{content};
        next unless ref $content eq 'ARRAY';
        for my $piece (@$content) {
            next unless ref $piece eq 'HASH';
            return 1 if ($piece->{type} // '') eq 'image_url';
        }
    }

    return 0;
}

# ---------------------------------------------------------------------------
# _extract_query_text(\@messages)
#
# Finds the text content of the most recent user message, walking the
# conversation backwards. Handles both plain string content and the
# multimodal array-of-parts form (text parts joined with spaces, image
# parts ignored). Returns undef when no user message yields any text.
# ---------------------------------------------------------------------------
sub _extract_query_text {
    my ($messages) = @_;

    for my $entry (reverse @$messages) {
        next if ($entry->{role} // '') ne 'user';

        my $content = $entry->{content};
        if (ref $content eq 'ARRAY') {
            # Multimodal: collect the text parts only.
            my @pieces;
            for my $part (@$content) {
                push @pieces, $part->{text} // ''
                    if ($part->{type} // '') eq 'text';
            }
            my $joined = join ' ', @pieces;
            return $joined if length $joined;
        }
        elsif (defined $content && length $content) {
            return $content;
        }
    }

    return undef;
}

# ---------------------------------------------------------------------------
# _expand_openai_models(\@ollama_models)
#
# Takes the 'data' array from Ollama's /v1/models and adds RAG variants.
# For each real model, generates one entry per configured brain:
#   "dental/ministral-3:14b"
# ---------------------------------------------------------------------------
sub _expand_openai_models {
    my ($models) = @_;

    # Honour the allow-list when models.conf supplied one.
    $models = [ grep { $ALLOWED_MODELS{ $_->{id} // '' } } @$models ]
        if %ALLOWED_MODELS;

    my @expanded = @$models;
    for my $entry (@$models) {
        my $id = $entry->{id} or next;    # skip entries without a usable id
        my $stamp = $entry->{created}  // time();
        my $owner = $entry->{owned_by} // 'library';

        # One synthetic "<brain>/<model>" entry per registered brain.
        push @expanded, map {
            {
                id       => "$_/$id",
                object   => 'model',
                created  => $stamp,
                owned_by => $owner,
            }
        } sort keys %BRAINS;
    }

    return \@expanded;
}

# ---------------------------------------------------------------------------
# _expand_ollama_models(\@ollama_models)
#
# Takes the 'models' array from Ollama's /api/tags and adds RAG variants,
# mirroring _expand_openai_models for the native protocol.
# ---------------------------------------------------------------------------
sub _expand_ollama_models {
    my ($models) = @_;

    # Honour the allow-list when models.conf supplied one.
    $models = [ grep { $ALLOWED_MODELS{ $_->{name} // '' } } @$models ]
        if %ALLOWED_MODELS;

    my @expanded = @$models;
    for my $entry (@$models) {
        my $name = $entry->{name} or next;
        for my $brain (sort keys %BRAINS) {
            # Shallow copy keeps size/digest/details fields from the original.
            my %clone = %$entry;
            $clone{name}  = "$brain/$name";
            $clone{model} = "$brain/$name";
            push @expanded, \%clone;
        }
    }

    return \@expanded;
}

# ---------------------------------------------------------------------------
# _openai_error($c, $status, $message, $type, $param)
#
# Render an OpenAI-format error response.
# ---------------------------------------------------------------------------
sub _openai_error {
    my ($c, $status, $message, $type, $param) = @_;
    $type //= 'api_error';

    $c->render(
        status => $status,
        json   => {
            error => {
                message => $message,
                type    => $type,
                code    => $status,
                param   => $param,
            }
        }
    );
}

# ---------------------------------------------------------------------------
# _strip_hop_by_hop($headers)
#
# Returns a hashref of headers safe to forward through a proxy.
# Strips hop-by-hop headers (RFC 2616 §13.5.1) to prevent request
# smuggling and protocol confusion.
# ---------------------------------------------------------------------------
sub _strip_hop_by_hop {
    my ($headers) = @_;

    my %hop = map { $_ => 1 } @HOP_BY_HOP;
    my $all = $headers->to_hash;

    my %safe;
    for my $name (keys %$all) {
        $safe{$name} = $all->{$name} unless $hop{lc($name)};
    }
    return \%safe;
}

# =============================================================================
# SECTION 12 — Start
# =============================================================================
#
# Default listen address for daemon mode. The user can override:
#   perl ragproxy daemon                   (uses RAGPROXY_PORT)
#   perl ragproxy daemon -l http://*:4000  (explicit override)
#   perl ragproxy routes                   (show route table)
#   perl ragproxy get /v1/models           (smoke test)
#
# =============================================================================

# Only inject the default listen address when the user asked for daemon mode
# and did not already pass an explicit -l/--listen.
if (grep { $_ eq 'daemon' } @ARGV
    and !grep { $_ eq '-l' || $_ eq '--listen' } @ARGV)
{
    push @ARGV, '-l', "http://*:$RAGPROXY_PORT";
}

app->start;

__END__

=head1 NAME

ragproxy - Universal RAG Proxy for TKF-2026

=head1 SYNOPSIS

    # Start on default port 7079
    perl ragproxy daemon

    # Custom port
    RAGPROXY_PORT=7070 perl ragproxy daemon

    # With dental brain
    BRAIN_DENTAL=/path/to/dental-brain perl ragproxy daemon

    # Show routes
    perl ragproxy routes

    # Quick smoke test
    perl ragproxy get /v1/models

=head1 DESCRIPTION

A Mojolicious::Lite reverse proxy that speaks both the OpenAI API and the
native Ollama API. Clients connect to it like a normal LLM endpoint. Under
the hood, it intercepts the question, searches Brain.pm vector databases
for relevant context, injects that context into the prompt, and forwards
to the real Ollama model.

Use model names like C<dental/ministral-3> to activate RAG for the
C<dental> brain database. Plain model names like C<ministral-3> are passed
through to Ollama without any RAG augmentation.

=head1 SEE ALSO

L<Brain> - The RAG engine

L<Mojolicious::Lite> - Web framework

The TKF-2026 project repository

=cut