Skip to content

Commit 6148b89

Browse files
authored
feat: add Ollama as runtime backend for Metal agent (#258)
Signed-off-by: Christopher Maher <[email protected]>
1 parent eaf9045 commit 6148b89

File tree

4 files changed

+491
-13
lines changed

4 files changed

+491
-13
lines changed

cmd/metal-agent/main.go

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ type AgentConfig struct {
5151
Runtime string
5252
OMLXBin string
5353
OMLXPort int
54+
OllamaPort int
5455
Port int
5556
LogLevel string
5657
HostIP string
@@ -142,9 +143,10 @@ func main() {
142143
flag.StringVar(&cfg.Namespace, "namespace", "default", "Kubernetes namespace to watch")
143144
flag.StringVar(&cfg.ModelStorePath, "model-store", "/tmp/llmkube-models", "Path to store downloaded models")
144145
flag.StringVar(&llamaServerFlag, "llama-server", "", "Path to llama-server binary (auto-detected if not set)")
145-
flag.StringVar(&cfg.Runtime, "runtime", "llama-server", "Inference runtime: llama-server or omlx")
146+
flag.StringVar(&cfg.Runtime, "runtime", "llama-server", "Inference runtime: llama-server, omlx, or ollama")
146147
flag.StringVar(&cfg.OMLXBin, "omlx-bin", "", "Path to omlx binary (auto-detected if not set)")
147148
flag.IntVar(&cfg.OMLXPort, "omlx-port", 8000, "Port for oMLX server")
149+
flag.IntVar(&cfg.OllamaPort, "ollama-port", 11434, "Port for Ollama server")
148150
flag.IntVar(&cfg.Port, "port", 9090, "Agent metrics/health port")
149151
flag.StringVar(&cfg.LogLevel, "log-level", "info", "Log level (debug, info, warn, error)")
150152
flag.StringVar(&cfg.HostIP, "host-ip", "", "IP address to register in Kubernetes endpoints (auto-detected if empty)")
@@ -194,6 +196,10 @@ func main() {
194196
os.Exit(1)
195197
}
196198
cfg.OMLXBin = resolvedBin
199+
case "ollama":
200+
// Ollama manages itself — no binary resolution needed.
201+
// The agent will check if Ollama is running at startup via health check.
202+
logger.Infow("using Ollama runtime", "port", cfg.OllamaPort)
197203
default:
198204
cfg.Runtime = "llama-server"
199205
resolvedBin, err := resolveLlamaServerBin(llamaServerFlag)
@@ -247,6 +253,8 @@ func main() {
247253
switch cfg.Runtime {
248254
case "omlx":
249255
logger.Infow("omlx binary found", "path", cfg.OMLXBin)
256+
case "ollama":
257+
logger.Infow("using Ollama daemon", "port", cfg.OllamaPort)
250258
default:
251259
logger.Infow("llama-server binary found", "path", cfg.LlamaServerBin)
252260
}
@@ -282,6 +290,7 @@ func main() {
282290
Runtime: cfg.Runtime,
283291
OMLXBin: cfg.OMLXBin,
284292
OMLXPort: cfg.OMLXPort,
293+
OllamaPort: cfg.OllamaPort,
285294
Port: cfg.Port,
286295
HostIP: cfg.HostIP,
287296
Logger: logger,

deployment/macos/README.md

Lines changed: 53 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -462,14 +462,65 @@ llmkube-metal-agent \
462462
--omlx-bin /path/to/omlx # Auto-detected from Homebrew if not set
463463
```
464464

465+
## Ollama Runtime
466+
467+
The Metal Agent also supports [Ollama](https://ollama.com) as a runtime backend. Ollama (version 0.19 and later) uses MLX natively on Apple Silicon, so this gives you fast inference with the tool most Mac users already have installed.
468+
469+
### Prerequisites
470+
471+
Install Ollama if you haven't already:
472+
473+
```bash
474+
brew install ollama
475+
```
476+
477+
### Usage
478+
479+
Start Ollama (if not already running as a menu bar app):
480+
481+
```bash
482+
ollama serve
483+
```
484+
485+
Start the Metal Agent with the Ollama runtime:
486+
487+
```bash
488+
llmkube-metal-agent --runtime ollama
489+
```
490+
491+
Deploy a model. The agent will pull the model through Ollama automatically:
492+
493+
```bash
494+
llmkube deploy llama-3.2-3b --gpu --accelerator metal
495+
```
496+
497+
The agent maps LLMKube catalog names to Ollama model tags (e.g., `llama-3.2-3b` becomes `llama3.2:3b`). If the model isn't already downloaded, Ollama pulls it from the Ollama registry.
498+
499+
### Differences from llama-server and oMLX
500+
501+
| Feature | llama-server | oMLX | Ollama |
502+
|---|---|---|---|
503+
| Model format | GGUF | MLX | GGUF (via Ollama registry) |
504+
| Model download | Manual / init container | Manual | Automatic (`/api/pull`) |
505+
| Install base | llama.cpp users | Small | Most Mac users |
506+
| CRD changes needed | None | MLX format | None |
507+
508+
### Ollama Flags
509+
510+
```bash
511+
llmkube-metal-agent \
512+
--runtime ollama \
513+
--ollama-port 11434 # Ollama server port (default: 11434)
514+
```
515+
465516
## Performance
466517

467518
Expected performance on M4 Max (32 GPU cores):
468-
- **Llama 3.2 3B**: 80-120 tok/s (llama-server), ~115 tok/s (oMLX)
519+
- **Llama 3.2 3B**: 80-120 tok/s (llama-server), ~115 tok/s (oMLX/Ollama MLX)
469520
- **Llama 3.1 8B**: 40-60 tok/s (llama-server)
470521
- **Mistral 7B**: 45-65 tok/s (llama-server)
471522

472-
oMLX uses Apple's MLX framework which is optimized for Apple Silicon unified memory.
523+
oMLX and Ollama (0.19+) both use Apple's MLX framework for Apple Silicon inference.
473524

474525
## Security
475526

pkg/agent/agent.go

Lines changed: 45 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -42,12 +42,14 @@ type MetalAgentConfig struct {
4242
HostIP string // explicit IP to register in K8s endpoints; empty = auto-detect
4343
Logger *zap.SugaredLogger
4444

45-
// Runtime selects the inference backend: "llama-server" (default) or "omlx".
45+
// Runtime selects the inference backend: "llama-server" (default), "omlx", or "ollama".
4646
Runtime string
4747
// OMLXBin is the path to the omlx binary. Only used when Runtime is "omlx".
4848
OMLXBin string
4949
// OMLXPort is the port the shared oMLX daemon listens on (default 8000).
5050
OMLXPort int
51+
// OllamaPort is the port the Ollama daemon listens on (default 11434).
52+
OllamaPort int
5153

5254
// MemoryProvider supplies system memory info. Nil defaults to DarwinMemoryProvider.
5355
MemoryProvider MemoryProvider
@@ -72,14 +74,14 @@ type MetalAgent struct {
7274
memoryFraction float64
7375
}
7476

75-
// ManagedProcess represents a running inference process (llama-server or oMLX model).
77+
// ManagedProcess represents a running inference process (llama-server, oMLX, or Ollama model).
7678
type ManagedProcess struct {
7779
Name string
7880
Namespace string
7981
PID int
8082
Port int
8183
ModelPath string
82-
ModelID string // oMLX model identifier used for unload; empty for llama-server
84+
ModelID string // oMLX/Ollama model identifier used for unload; empty for llama-server
8385
StartedAt time.Time
8486
Healthy bool
8587
}
@@ -148,6 +150,15 @@ func (a *MetalAgent) Start(ctx context.Context) error {
148150
port,
149151
a.logger.With("subsystem", "executor"),
150152
)
153+
case "ollama":
154+
port := a.config.OllamaPort
155+
if port == 0 {
156+
port = 11434
157+
}
158+
a.executor = NewOllamaExecutor(
159+
port,
160+
a.logger.With("subsystem", "executor"),
161+
)
151162
default:
152163
a.executor = NewMetalExecutor(
153164
a.config.LlamaServerBin,
@@ -294,6 +305,14 @@ func (a *MetalAgent) ensureProcess(ctx context.Context, isvc *inferencev1alpha1.
294305
"model", model.Name, "format", modelFormat, "runtime", a.config.Runtime)
295306
return fmt.Errorf("model %s has format %q which is incompatible with omlx runtime", model.Name, modelFormat)
296307
}
308+
case "ollama":
309+
if modelFormat == "mlx" {
310+
a.logger.Warnw("skipping MLX model on Ollama runtime",
311+
"model", model.Name, "format", modelFormat, "runtime", a.config.Runtime)
312+
return fmt.Errorf(
313+
"model %s has format %q which is incompatible with ollama runtime",
314+
model.Name, modelFormat)
315+
}
297316
default:
298317
if modelFormat == "mlx" {
299318
a.logger.Warnw("skipping MLX model on llama-server runtime",
@@ -443,10 +462,17 @@ func (a *MetalAgent) deleteProcess(ctx context.Context, key string) error {
443462

444463
var deleteErrors []error
445464

446-
// For oMLX, unload the specific model instead of killing the shared daemon.
447-
if omlx, ok := a.executor.(*OMLXExecutor); ok && process.ModelID != "" {
465+
// For shared-daemon runtimes (oMLX, Ollama), unload the specific model
466+
// instead of killing the shared daemon.
467+
if ollama, ok := a.executor.(*OllamaExecutor); ok && process.ModelID != "" {
468+
if err := ollama.UnloadModel(ctx, process.ModelID); err != nil {
469+
deleteErrors = append(deleteErrors,
470+
fmt.Errorf("failed to unload Ollama model %s: %w", process.ModelID, err))
471+
}
472+
} else if omlx, ok := a.executor.(*OMLXExecutor); ok && process.ModelID != "" {
448473
if err := omlx.UnloadModel(ctx, process.ModelID); err != nil {
449-
deleteErrors = append(deleteErrors, fmt.Errorf("failed to unload oMLX model %s: %w", process.ModelID, err))
474+
deleteErrors = append(deleteErrors,
475+
fmt.Errorf("failed to unload oMLX model %s: %w", process.ModelID, err))
450476
}
451477
} else if err := a.executor.StopProcess(process.PID); err != nil {
452478
deleteErrors = append(deleteErrors, fmt.Errorf("failed to stop process: %w", err))
@@ -496,17 +522,26 @@ func (a *MetalAgent) Shutdown(ctx context.Context) error {
496522

497523
var shutdownErrors []error
498524

499-
// For oMLX, unload each model from the shared daemon.
525+
// For shared-daemon runtimes (oMLX, Ollama), unload each model instead of
526+
// killing the daemon.
500527
omlx, isOMLX := a.executor.(*OMLXExecutor)
528+
ollama, isOllama := a.executor.(*OllamaExecutor)
501529

502530
for key, process := range a.processes {
503-
if isOMLX && process.ModelID != "" {
531+
if isOllama && process.ModelID != "" {
532+
if err := ollama.UnloadModel(ctx, process.ModelID); err != nil {
533+
shutdownErrors = append(shutdownErrors,
534+
fmt.Errorf("failed to unload Ollama model %s: %w", key, err))
535+
}
536+
} else if isOMLX && process.ModelID != "" {
504537
if err := omlx.UnloadModel(ctx, process.ModelID); err != nil {
505-
shutdownErrors = append(shutdownErrors, fmt.Errorf("failed to unload oMLX model %s: %w", key, err))
538+
shutdownErrors = append(shutdownErrors,
539+
fmt.Errorf("failed to unload oMLX model %s: %w", key, err))
506540
}
507541
} else {
508542
if err := a.executor.StopProcess(process.PID); err != nil {
509-
shutdownErrors = append(shutdownErrors, fmt.Errorf("failed to stop %s: %w", key, err))
543+
shutdownErrors = append(shutdownErrors,
544+
fmt.Errorf("failed to stop %s: %w", key, err))
510545
}
511546
}
512547
}

0 commit comments

Comments
 (0)