Skip to content

Commit 6148b89

Browse files
authored
feat: add Ollama as runtime backend for Metal agent (#258)
Signed-off-by: Christopher Maher <[email protected]>
1 parent eaf9045 commit 6148b89

File tree

4 files changed

+491
-13
lines changed

4 files changed

+491
-13
lines changed

cmd/metal-agent/main.go

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ type AgentConfig struct {
5151
Runtime string
5252
OMLXBin string
5353
OMLXPort int
54+
OllamaPort int
5455
Port int
5556
LogLevel string
5657
HostIP string
@@ -142,9 +143,10 @@ func main() {
142143
flag.StringVar(&cfg.Namespace, "namespace", "default", "Kubernetes namespace to watch")
143144
flag.StringVar(&cfg.ModelStorePath, "model-store", "/tmp/llmkube-models", "Path to store downloaded models")
144145
flag.StringVar(&llamaServerFlag, "llama-server", "", "Path to llama-server binary (auto-detected if not set)")
145-
flag.StringVar(&cfg.Runtime, "runtime", "llama-server", "Inference runtime: llama-server or omlx")
146+
flag.StringVar(&cfg.Runtime, "runtime", "llama-server", "Inference runtime: llama-server, omlx, or ollama")
146147
flag.StringVar(&cfg.OMLXBin, "omlx-bin", "", "Path to omlx binary (auto-detected if not set)")
147148
flag.IntVar(&cfg.OMLXPort, "omlx-port", 8000, "Port for oMLX server")
149+
flag.IntVar(&cfg.OllamaPort, "ollama-port", 11434, "Port for Ollama server")
148150
flag.IntVar(&cfg.Port, "port", 9090, "Agent metrics/health port")
149151
flag.StringVar(&cfg.LogLevel, "log-level", "info", "Log level (debug, info, warn, error)")
150152
flag.StringVar(&cfg.HostIP, "host-ip", "", "IP address to register in Kubernetes endpoints (auto-detected if empty)")
@@ -194,6 +196,10 @@ func main() {
194196
os.Exit(1)
195197
}
196198
cfg.OMLXBin = resolvedBin
199+
case "ollama":
200+
// Ollama manages itself — no binary resolution needed.
201+
// The agent will check if Ollama is running at startup via health check.
202+
logger.Infow("using Ollama runtime", "port", cfg.OllamaPort)
197203
default:
198204
cfg.Runtime = "llama-server"
199205
resolvedBin, err := resolveLlamaServerBin(llamaServerFlag)
@@ -247,6 +253,8 @@ func main() {
247253
switch cfg.Runtime {
248254
case "omlx":
249255
logger.Infow("omlx binary found", "path", cfg.OMLXBin)
256+
case "ollama":
257+
logger.Infow("using Ollama daemon", "port", cfg.OllamaPort)
250258
default:
251259
logger.Infow("llama-server binary found", "path", cfg.LlamaServerBin)
252260
}
@@ -282,6 +290,7 @@ func main() {
282290
Runtime: cfg.Runtime,
283291
OMLXBin: cfg.OMLXBin,
284292
OMLXPort: cfg.OMLXPort,
293+
OllamaPort: cfg.OllamaPort,
285294
Port: cfg.Port,
286295
HostIP: cfg.HostIP,
287296
Logger: logger,

deployment/macos/README.md

Lines changed: 53 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -462,14 +462,65 @@ llmkube-metal-agent \
462462
--omlx-bin /path/to/omlx # Auto-detected from Homebrew if not set
463463
```
464464

465+
## Ollama Runtime
466+
467+
The Metal Agent also supports [Ollama](https://ollama.com) as a runtime backend. Ollama (version 0.19 and later) uses MLX natively on Apple Silicon, so this gives you fast inference with the tool most Mac users already have installed.
468+
469+
### Prerequisites
470+
471+
Install Ollama if you haven't already:
472+
473+
```bash
474+
brew install ollama
475+
```
476+
477+
### Usage
478+
479+
Start Ollama (if not already running as a menu bar app):
480+
481+
```bash
482+
ollama serve
483+
```
484+
485+
Start the Metal Agent with the Ollama runtime:
486+
487+
```bash
488+
llmkube-metal-agent --runtime ollama
489+
```
490+
491+
Deploy a model. The agent will pull the model through Ollama automatically:
492+
493+
```bash
494+
llmkube deploy llama-3.2-3b --gpu --accelerator metal
495+
```
496+
497+
The agent maps LLMKube catalog names to Ollama model tags (e.g., `llama-3.2-3b` becomes `llama3.2:3b`). If the model isn't already downloaded, Ollama pulls it from the Ollama registry.
498+
499+
### Differences from llama-server and oMLX
500+
501+
| Feature | llama-server | oMLX | Ollama |
502+
|---|---|---|---|
503+
| Model format | GGUF | MLX | GGUF (via Ollama registry) |
504+
| Model download | Manual / init container | Manual | Automatic (`/api/pull`) |
505+
| Install base | llama.cpp users | Small | Most Mac users |
506+
| CRD changes needed | None | MLX format | None |
507+
508+
### Ollama Flags
509+
510+
```bash
511+
llmkube-metal-agent \
512+
--runtime ollama \
513+
--ollama-port 11434 # Ollama server port (default: 11434)
514+
```
515+
465516
## Performance
466517

467518
Expected performance on M4 Max (32 GPU cores):
468-
- **Llama 3.2 3B**: 80-120 tok/s (llama-server), ~115 tok/s (oMLX)
519+
- **Llama 3.2 3B**: 80-120 tok/s (llama-server), ~115 tok/s (oMLX/Ollama MLX)
469520
- **Llama 3.1 8B**: 40-60 tok/s (llama-server)
470521
- **Mistral 7B**: 45-65 tok/s (llama-server)
471522

472-
oMLX uses Apple's MLX framework which is optimized for Apple Silicon unified memory.
523+
oMLX and Ollama (0.19+) both use Apple's MLX framework for Apple Silicon inference.
473524

474525
## Security
475526

pkg/agent/agent.go

Lines changed: 45 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -42,12 +42,14 @@ type MetalAgentConfig struct {
4242
HostIP string // explicit IP to register in K8s endpoints; empty = auto-detect
4343
Logger *zap.SugaredLogger
4444

45-
// Runtime selects the inference backend: "llama-server" (default) or "omlx".
45+
// Runtime selects the inference backend: "llama-server" (default), "omlx", or "ollama".
4646
Runtime string
4747
// OMLXBin is the path to the omlx binary. Only used when Runtime is "omlx".
4848
OMLXBin string
4949
// OMLXPort is the port the shared oMLX daemon listens on (default 8000).
5050
OMLXPort int
51+
// OllamaPort is the port the Ollama daemon listens on (default 11434).
52+
OllamaPort int
5153

5254
// MemoryProvider supplies system memory info. Nil defaults to DarwinMemoryProvider.
5355
MemoryProvider MemoryProvider
@@ -72,14 +74,14 @@ type MetalAgent struct {
7274
memoryFraction float64
7375
}
7476

75-
// ManagedProcess represents a running inference process (llama-server or oMLX model).
77+
// ManagedProcess represents a running inference process (llama-server, oMLX, or Ollama model).
7678
type ManagedProcess struct {
7779
Name string
7880
Namespace string
7981
PID int
8082
Port int
8183
ModelPath string
82-
ModelID string // oMLX model identifier used for unload; empty for llama-server
84+
ModelID string // oMLX/Ollama model identifier used for unload; empty for llama-server
8385
StartedAt time.Time
8486
Healthy bool
8587
}
@@ -148,6 +150,15 @@ func (a *MetalAgent) Start(ctx context.Context) error {
148150
port,
149151
a.logger.With("subsystem", "executor"),
150152
)
153+
case "ollama":
154+
port := a.config.OllamaPort
155+
if port == 0 {
156+
port = 11434
157+
}
158+
a.executor = NewOllamaExecutor(
159+
port,
160+
a.logger.With("subsystem", "executor"),
161+
)
151162
default:
152163
a.executor = NewMetalExecutor(
153164
a.config.LlamaServerBin,
@@ -294,6 +305,14 @@ func (a *MetalAgent) ensureProcess(ctx context.Context, isvc *inferencev1alpha1.
294305
"model", model.Name, "format", modelFormat, "runtime", a.config.Runtime)
295306
return fmt.Errorf("model %s has format %q which is incompatible with omlx runtime", model.Name, modelFormat)
296307
}
308+
case "ollama":
309+
if modelFormat == "mlx" {
310+
a.logger.Warnw("skipping MLX model on Ollama runtime",
311+
"model", model.Name, "format", modelFormat, "runtime", a.config.Runtime)
312+
return fmt.Errorf(
313+
"model %s has format %q which is incompatible with ollama runtime",
314+
model.Name, modelFormat)
315+
}
297316
default:
298317
if modelFormat == "mlx" {
299318
a.logger.Warnw("skipping MLX model on llama-server runtime",
@@ -443,10 +462,17 @@ func (a *MetalAgent) deleteProcess(ctx context.Context, key string) error {
443462

444463
var deleteErrors []error
445464

446-
// For oMLX, unload the specific model instead of killing the shared daemon.
447-
if omlx, ok := a.executor.(*OMLXExecutor); ok && process.ModelID != "" {
465+
// For shared-daemon runtimes (oMLX, Ollama), unload the specific model
466+
// instead of killing the shared daemon.
467+
if ollama, ok := a.executor.(*OllamaExecutor); ok && process.ModelID != "" {
468+
if err := ollama.UnloadModel(ctx, process.ModelID); err != nil {
469+
deleteErrors = append(deleteErrors,
470+
fmt.Errorf("failed to unload Ollama model %s: %w", process.ModelID, err))
471+
}
472+
} else if omlx, ok := a.executor.(*OMLXExecutor); ok && process.ModelID != "" {
448473
if err := omlx.UnloadModel(ctx, process.ModelID); err != nil {
449-
deleteErrors = append(deleteErrors, fmt.Errorf("failed to unload oMLX model %s: %w", process.ModelID, err))
474+
deleteErrors = append(deleteErrors,
475+
fmt.Errorf("failed to unload oMLX model %s: %w", process.ModelID, err))
450476
}
451477
} else if err := a.executor.StopProcess(process.PID); err != nil {
452478
deleteErrors = append(deleteErrors, fmt.Errorf("failed to stop process: %w", err))
@@ -496,17 +522,26 @@ func (a *MetalAgent) Shutdown(ctx context.Context) error {
496522

497523
var shutdownErrors []error
498524

499-
// For oMLX, unload each model from the shared daemon.
525+
// For shared-daemon runtimes (oMLX, Ollama), unload each model instead of
526+
// killing the daemon.
500527
omlx, isOMLX := a.executor.(*OMLXExecutor)
528+
ollama, isOllama := a.executor.(*OllamaExecutor)
501529

502530
for key, process := range a.processes {
503-
if isOMLX && process.ModelID != "" {
531+
if isOllama && process.ModelID != "" {
532+
if err := ollama.UnloadModel(ctx, process.ModelID); err != nil {
533+
shutdownErrors = append(shutdownErrors,
534+
fmt.Errorf("failed to unload Ollama model %s: %w", key, err))
535+
}
536+
} else if isOMLX && process.ModelID != "" {
504537
if err := omlx.UnloadModel(ctx, process.ModelID); err != nil {
505-
shutdownErrors = append(shutdownErrors, fmt.Errorf("failed to unload oMLX model %s: %w", key, err))
538+
shutdownErrors = append(shutdownErrors,
539+
fmt.Errorf("failed to unload oMLX model %s: %w", key, err))
506540
}
507541
} else {
508542
if err := a.executor.StopProcess(process.PID); err != nil {
509-
shutdownErrors = append(shutdownErrors, fmt.Errorf("failed to stop %s: %w", key, err))
543+
shutdownErrors = append(shutdownErrors,
544+
fmt.Errorf("failed to stop %s: %w", key, err))
510545
}
511546
}
512547
}

0 commit comments

Comments
 (0)