61 changes: 61 additions & 0 deletions packages/llm/ollama/Dockerfile
@@ -0,0 +1,61 @@
#---
# name: ollama
# group: llm
# config: config.py
# depends: [build-essential, cuda]
# requires: '>=34.1.0'
# docs: docs.md
#---
ARG BASE_IMAGE
ARG CMAKE_CUDA_ARCHITECTURES
ARG JETPACK_VERSION
ARG OLLAMA_REPO
ARG OLLAMA_BRANCH
ARG GOLANG_VERSION
ARG CMAKE_VERSION

FROM ${BASE_IMAGE} AS ollama-l4t-build

ARG OLLAMA_REPO
ARG OLLAMA_BRANCH
ARG GOLANG_VERSION
ARG CMAKE_VERSION
ARG CMAKE_CUDA_ARCHITECTURES

WORKDIR /opt

ADD https://api.github.com/repos/${OLLAMA_REPO}/git/refs/heads/${OLLAMA_BRANCH} /tmp/ollama_version.json
RUN git clone --branch=${OLLAMA_BRANCH} --depth=1 --recursive https://github.com/${OLLAMA_REPO}

COPY ollama_deps.sh /opt/ollama_deps.sh
RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /opt/ollama_deps.sh

# generate llama.cpp backend to bundle with Ollama
WORKDIR ollama/llm/generate
ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/nvidia/compat
ENV CMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}
RUN bash gen_linux.sh

WORKDIR /opt/ollama
RUN go build -trimpath .

# build the runtime container
FROM ${BASE_IMAGE}
COPY --from=ollama-l4t-build /opt/ollama/ollama /bin/ollama

ARG JETPACK_VERSION

EXPOSE 11434
ENV OLLAMA_HOST=0.0.0.0
ENV PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib:/usr/local/cuda/lib64:/usr/local/cuda/include
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
ENV JETSON_JETPACK=${JETPACK_VERSION}

COPY test.sh /test.sh
COPY benchmark.py /benchmark.py
COPY benchmark.sh /benchmark.sh

ENTRYPOINT ["/bin/bash", "-c"]

CMD ["/bin/ollama serve"]
141 changes: 141 additions & 0 deletions packages/llm/ollama/README.md
@@ -0,0 +1,141 @@
# llama_cpp

> [`CONTAINERS`](#user-content-containers) [`IMAGES`](#user-content-images) [`RUN`](#user-content-run) [`BUILD`](#user-content-build)


* llama.cpp from https://github.com/ggerganov/llama.cpp with CUDA enabled (found under `/opt/llama.cpp`)
* Python bindings from https://github.com/abetlen/llama-cpp-python (found under `/opt/llama-cpp-python`)

> [!WARNING]
> Starting with version 0.1.79, the model format has changed from GGML to GGUF. Existing GGML models can be converted using the `convert-llama-ggmlv3-to-gguf.py` script in [`llama.cpp`](https://github.com/ggerganov/llama.cpp) (or you can often find the GGUF conversions on [HuggingFace Hub](https://huggingface.co/models?search=GGUF))
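
A conversion run is sketched below with placeholder model paths; the script's flags can change between llama.cpp revisions, so check its `--help` before running:

```bash
# convert a legacy GGML checkpoint to GGUF (input/output filenames are placeholders)
python3 convert-llama-ggmlv3-to-gguf.py \
    --input  llama-2-7b.ggmlv3.q4_K_S.bin \
    --output llama-2-7b.Q4_K_S.gguf
```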

There are two branches of this container for backwards compatibility:

* `llama_cpp:gguf` (the default, which tracks upstream master)
* `llama_cpp:ggml` (which still supports GGML model format)

There are a couple patches applied to the legacy GGML fork:

* fixed `__fp16` typedef in llama.h on ARM64 (use `half` with NVCC)
* parsing of BOS/EOS tokens (see https://github.com/ggerganov/llama.cpp/pull/1931)

### Inference Benchmark

You can use llama.cpp's built-in [`main`](https://github.com/ggerganov/llama.cpp/tree/master/examples/main) tool to run GGUF models (from [HuggingFace Hub](https://huggingface.co/models?search=gguf) or elsewhere)

```bash
./run.sh --workdir=/opt/llama.cpp/bin $(./autotag llama_cpp) /bin/bash -c \
'./main --model $(huggingface-downloader TheBloke/Llama-2-7B-GGUF/llama-2-7b.Q4_K_S.gguf) \
--prompt "Once upon a time," \
--n-predict 128 --ctx-size 192 --batch-size 192 \
--n-gpu-layers 999 --threads $(nproc)'
```

> the `--model` argument expects a .gguf filename (typically the `Q4_K_S` quantization is used) <br>
> if you're trying to load Llama-2-70B, add the `--gqa 8` flag

To use the Python API and [`benchmark.py`](/packages/llm/llama_cpp/benchmark.py) instead:

```bash
./run.sh --workdir=/opt/llama.cpp/bin $(./autotag llama_cpp) /bin/bash -c \
'python3 benchmark.py --model $(huggingface-downloader TheBloke/Llama-2-7B-GGUF/llama-2-7b.Q4_K_S.gguf) \
--prompt "Once upon a time," \
--n-predict 128 --ctx-size 192 --batch-size 192 \
--n-gpu-layers 999 --threads $(nproc)'
```

### Memory Usage

| Model | Quantization | Memory (MB) |
|---------------------------------------------------------------------------------|:-----------------------------:|:-----------:|
| [`TheBloke/Llama-2-7B-GGUF`](https://huggingface.co/TheBloke/Llama-2-7B-GGUF) | `llama-2-7b.Q4_K_S.gguf` | 5,268 |
| [`TheBloke/Llama-2-13B-GGUF`](https://huggingface.co/TheBloke/Llama-2-13B-GGUF) | `llama-2-13b.Q4_K_S.gguf` | 8,609 |
| [`TheBloke/LLaMA-30b-GGUF`](https://huggingface.co/TheBloke/LLaMA-30b-GGUF) | `llama-30b.Q4_K_S.gguf` | 19,045 |
| [`TheBloke/Llama-2-70B-GGUF`](https://huggingface.co/TheBloke/Llama-2-70B-GGUF) | `llama-2-70b.Q4_K_S.gguf` | 37,655 |

<details open>
<summary><b><a id="containers">CONTAINERS</a></b></summary>
<br>

| **`llama_cpp:ggml`** | |
| :-- | :-- |
| &nbsp;&nbsp;&nbsp;Builds | [![`llama_cpp-ggml_jp51`](https://img.shields.io/github/actions/workflow/status/dusty-nv/jetson-containers/llama_cpp-ggml_jp51.yml?label=llama_cpp-ggml:jp51)](https://github.com/dusty-nv/jetson-containers/actions/workflows/llama_cpp-ggml_jp51.yml) [![`llama_cpp-ggml_jp60`](https://img.shields.io/github/actions/workflow/status/dusty-nv/jetson-containers/llama_cpp-ggml_jp60.yml?label=llama_cpp-ggml:jp60)](https://github.com/dusty-nv/jetson-containers/actions/workflows/llama_cpp-ggml_jp60.yml) |
| &nbsp;&nbsp;&nbsp;Requires | `L4T >=34.1.0` |
| &nbsp;&nbsp;&nbsp;Dependencies | [`build-essential`](/packages/build-essential) [`cuda`](/packages/cuda/cuda) [`cudnn`](/packages/cuda/cudnn) [`python`](/packages/python) [`cmake`](/packages/cmake/cmake_pip) [`numpy`](/packages/numpy) [`huggingface_hub`](/packages/llm/huggingface_hub) |
| &nbsp;&nbsp;&nbsp;Dockerfile | [`Dockerfile`](Dockerfile) |
| &nbsp;&nbsp;&nbsp;Images | [`dustynv/llama_cpp:ggml-r35.2.1`](https://hub.docker.com/r/dustynv/llama_cpp/tags) `(2023-12-05, 5.2GB)`<br>[`dustynv/llama_cpp:ggml-r35.3.1`](https://hub.docker.com/r/dustynv/llama_cpp/tags) `(2023-12-06, 5.2GB)`<br>[`dustynv/llama_cpp:ggml-r35.4.1`](https://hub.docker.com/r/dustynv/llama_cpp/tags) `(2023-12-19, 5.2GB)`<br>[`dustynv/llama_cpp:ggml-r36.2.0`](https://hub.docker.com/r/dustynv/llama_cpp/tags) `(2023-12-19, 5.1GB)` |

| **`llama_cpp:gguf`** | |
| :-- | :-- |
| &nbsp;&nbsp;&nbsp;Aliases | `llama_cpp` |
| &nbsp;&nbsp;&nbsp;Builds | [![`llama_cpp-gguf_jp60`](https://img.shields.io/github/actions/workflow/status/dusty-nv/jetson-containers/llama_cpp-gguf_jp60.yml?label=llama_cpp-gguf:jp60)](https://github.com/dusty-nv/jetson-containers/actions/workflows/llama_cpp-gguf_jp60.yml) [![`llama_cpp-gguf_jp51`](https://img.shields.io/github/actions/workflow/status/dusty-nv/jetson-containers/llama_cpp-gguf_jp51.yml?label=llama_cpp-gguf:jp51)](https://github.com/dusty-nv/jetson-containers/actions/workflows/llama_cpp-gguf_jp51.yml) |
| &nbsp;&nbsp;&nbsp;Requires | `L4T >=34.1.0` |
| &nbsp;&nbsp;&nbsp;Dependencies | [`build-essential`](/packages/build-essential) [`cuda`](/packages/cuda/cuda) [`cudnn`](/packages/cuda/cudnn) [`python`](/packages/python) [`cmake`](/packages/cmake/cmake_pip) [`numpy`](/packages/numpy) [`huggingface_hub`](/packages/llm/huggingface_hub) |
| &nbsp;&nbsp;&nbsp;Dependants | [`l4t-text-generation`](/packages/l4t/l4t-text-generation) [`langchain`](/packages/llm/langchain) [`langchain:samples`](/packages/llm/langchain) [`text-generation-webui:1.7`](/packages/llm/text-generation-webui) [`text-generation-webui:6a7cd01`](/packages/llm/text-generation-webui) [`text-generation-webui:main`](/packages/llm/text-generation-webui) |
| &nbsp;&nbsp;&nbsp;Dockerfile | [`Dockerfile`](Dockerfile) |
| &nbsp;&nbsp;&nbsp;Images | [`dustynv/llama_cpp:gguf-r35.2.1`](https://hub.docker.com/r/dustynv/llama_cpp/tags) `(2023-12-15, 5.1GB)`<br>[`dustynv/llama_cpp:gguf-r35.3.1`](https://hub.docker.com/r/dustynv/llama_cpp/tags) `(2023-12-19, 5.2GB)`<br>[`dustynv/llama_cpp:gguf-r35.4.1`](https://hub.docker.com/r/dustynv/llama_cpp/tags) `(2023-12-15, 5.1GB)`<br>[`dustynv/llama_cpp:gguf-r36.2.0`](https://hub.docker.com/r/dustynv/llama_cpp/tags) `(2023-12-19, 5.1GB)` |

</details>

<details open>
<summary><b><a id="images">CONTAINER IMAGES</a></b></summary>
<br>

| Repository/Tag | Date | Arch | Size |
| :-- | :--: | :--: | :--: |
| &nbsp;&nbsp;[`dustynv/llama_cpp:ggml-r35.2.1`](https://hub.docker.com/r/dustynv/llama_cpp/tags) | `2023-12-05` | `arm64` | `5.2GB` |
| &nbsp;&nbsp;[`dustynv/llama_cpp:ggml-r35.3.1`](https://hub.docker.com/r/dustynv/llama_cpp/tags) | `2023-12-06` | `arm64` | `5.2GB` |
| &nbsp;&nbsp;[`dustynv/llama_cpp:ggml-r35.4.1`](https://hub.docker.com/r/dustynv/llama_cpp/tags) | `2023-12-19` | `arm64` | `5.2GB` |
| &nbsp;&nbsp;[`dustynv/llama_cpp:ggml-r36.2.0`](https://hub.docker.com/r/dustynv/llama_cpp/tags) | `2023-12-19` | `arm64` | `5.1GB` |
| &nbsp;&nbsp;[`dustynv/llama_cpp:gguf-r35.2.1`](https://hub.docker.com/r/dustynv/llama_cpp/tags) | `2023-12-15` | `arm64` | `5.1GB` |
| &nbsp;&nbsp;[`dustynv/llama_cpp:gguf-r35.3.1`](https://hub.docker.com/r/dustynv/llama_cpp/tags) | `2023-12-19` | `arm64` | `5.2GB` |
| &nbsp;&nbsp;[`dustynv/llama_cpp:gguf-r35.4.1`](https://hub.docker.com/r/dustynv/llama_cpp/tags) | `2023-12-15` | `arm64` | `5.1GB` |
| &nbsp;&nbsp;[`dustynv/llama_cpp:gguf-r36.2.0`](https://hub.docker.com/r/dustynv/llama_cpp/tags) | `2023-12-19` | `arm64` | `5.1GB` |
| &nbsp;&nbsp;[`dustynv/llama_cpp:r35.2.1`](https://hub.docker.com/r/dustynv/llama_cpp/tags) | `2023-08-29` | `arm64` | `5.2GB` |
| &nbsp;&nbsp;[`dustynv/llama_cpp:r35.3.1`](https://hub.docker.com/r/dustynv/llama_cpp/tags) | `2023-08-15` | `arm64` | `5.2GB` |
| &nbsp;&nbsp;[`dustynv/llama_cpp:r35.4.1`](https://hub.docker.com/r/dustynv/llama_cpp/tags) | `2023-08-13` | `arm64` | `5.1GB` |
| &nbsp;&nbsp;[`dustynv/llama_cpp:r36.2.0`](https://hub.docker.com/r/dustynv/llama_cpp/tags) | `2024-02-22` | `arm64` | `5.3GB` |

> <sub>Container images are compatible with other minor versions of JetPack/L4T:</sub><br>
> <sub>&nbsp;&nbsp;&nbsp;&nbsp;• L4T R32.7 containers can run on other versions of L4T R32.7 (JetPack 4.6+)</sub><br>
> <sub>&nbsp;&nbsp;&nbsp;&nbsp;• L4T R35.x containers can run on other versions of L4T R35.x (JetPack 5.1+)</sub><br>
</details>

<details open>
<summary><b><a id="run">RUN CONTAINER</a></b></summary>
<br>

To start the container, you can use the [`run.sh`](/docs/run.md)/[`autotag`](/docs/run.md#autotag) helpers or manually put together a [`docker run`](https://docs.docker.com/engine/reference/commandline/run/) command:
```bash
# automatically pull or build a compatible container image
./run.sh $(./autotag llama_cpp)

# or explicitly specify one of the container images above
./run.sh dustynv/llama_cpp:r36.2.0

# or if using 'docker run' (specify image and mounts/etc)
sudo docker run --runtime nvidia -it --rm --network=host dustynv/llama_cpp:r36.2.0
```
> <sup>[`run.sh`](/docs/run.md) forwards arguments to [`docker run`](https://docs.docker.com/engine/reference/commandline/run/) with some defaults added (like `--runtime nvidia`, mounts a `/data` cache, and detects devices)</sup><br>
> <sup>[`autotag`](/docs/run.md#autotag) finds a container image that's compatible with your version of JetPack/L4T - either locally, pulled from a registry, or by building it.</sup>

To mount your own directories into the container, use the [`-v`](https://docs.docker.com/engine/reference/commandline/run/#volume) or [`--volume`](https://docs.docker.com/engine/reference/commandline/run/#volume) flags:
```bash
./run.sh -v /path/on/host:/path/in/container $(./autotag llama_cpp)
```
To launch the container running a command, as opposed to an interactive shell:
```bash
./run.sh $(./autotag llama_cpp) my_app --abc xyz
```
You can pass any options to [`run.sh`](/docs/run.md) that you would to [`docker run`](https://docs.docker.com/engine/reference/commandline/run/), and it'll print out the full command that it constructs before executing it.
</details>
<details open>
<summary><b><a id="build">BUILD CONTAINER</b></summary>
<br>

If you use [`autotag`](/docs/run.md#autotag) as shown above, it'll ask to build the container for you if needed. To manually build it, first do the [system setup](/docs/setup.md), then run:
```bash
./build.sh llama_cpp
```
The dependencies from above will be built into the container, and it'll be tested during the build. See [`./build.sh --help`](/jetson_containers/build.py) for build options.
</details>
70 changes: 70 additions & 0 deletions packages/llm/ollama/benchmark.py
@@ -0,0 +1,70 @@
#!/usr/bin/env python3
# benchmark a quantized model served by the ollama API
import time
import argparse
import json
import requests
from pprint import pp

# Small LLM: tinyllama

DEFAULT_PROMPT = {
    "model": "tinyllama",
    "prompt": "Why is the sky blue?",
    "options": {
        "seed": 123,
        "temperature": 0
    },
    "format": "json",
    "stream": False,
}

# parse command-line arguments
parser = argparse.ArgumentParser()

parser.add_argument('-m', '--model', type=str, default='tinyllama', help="name of the model to run")
parser.add_argument('-p', '--prompt', type=str, default=DEFAULT_PROMPT.get("prompt"))
parser.add_argument('--runs', type=int, default=2, help='the number of benchmark timing iterations')
parser.add_argument('--OLLAMA_PID', type=str, required=True, help='the PID of the ollama server process')

args = parser.parse_args()

print(args)

data = DEFAULT_PROMPT.copy()
data['model'] = args.model
data['prompt'] = args.prompt

def get_max_memory_usage(PID: str = "self") -> float:
    ''' Peak memory usage (VmPeak) of the given process, in bytes '''
    with open(f'/proc/{PID}/status', encoding='utf-8') as f:
        memusage = f.read().split('VmPeak:')[1].split('\n')[0][:-3]

    # VmPeak is reported in kB, so convert to bytes
    return int(memusage.strip()) * 1024

def send_test_prompt(json_data: dict, url: str = "") -> requests.Response:
    ''' send a test prompt to the local ollama server '''
    if not url:
        url = "http://127.0.0.1:11434/api/generate"
    return requests.post(url, json=json_data)

def run_benchmark(runs: int, json_data: dict, test_url: str = "") -> None:
    ''' run the benchmark '''
    time_avg = 0.0
    for run in range(runs):
        time_begin = time.perf_counter()
        response = send_test_prompt(json_data, test_url)
        time_elapsed = time.perf_counter() - time_begin

        if not response.ok:
            pp(f'received error code from api service: {response.status_code}')
            continue
        pp(f'[+] run #{run}')
        pp(f'[-] model: {json_data["model"]}, prompt: {json_data["prompt"]}')
        pp(f'[-] response: {json.loads(response.text)["response"].strip()}, elapsed time: {time_elapsed:.2f}s')
        time_avg += float(time_elapsed)

    pp(f'[+] peak ram used: {get_max_memory_usage(args.OLLAMA_PID) / 1024 / 1024:.1f} MB')
    if runs > 0:
        pp(f'[+] average time: {time_avg / runs:.2f}s')

run_benchmark(args.runs, data, "")
19 changes: 19 additions & 0 deletions packages/llm/ollama/benchmark.sh
@@ -0,0 +1,19 @@
#!/usr/bin/env bash

set -xe

# find a running ollama server, ignoring the grep process itself (the PID is the 2nd column of `ps -ef`)
OLLAMA_PID=$(ps -ef | grep 'ollama serve' | grep -v grep | awk '{ print $2 }')

if [ -z "${OLLAMA_PID}" ]; then
    # no server running yet - start one in the background and record its PID
    /bin/ollama serve &
    OLLAMA_PID=$!
    sleep 2  # give the server a moment to start listening
fi

if [ -z "${OLLAMA_PID}" ]; then
    echo "ollama binary not running. exiting"
    exit 1
fi

python3 /benchmark.py --OLLAMA_PID ${OLLAMA_PID}

kill ${OLLAMA_PID}
17 changes: 17 additions & 0 deletions packages/llm/ollama/config.py
@@ -0,0 +1,17 @@

import copy
from jetson_containers import CUDA_ARCHITECTURES, JETPACK_VERSION

ollama = copy.deepcopy(package)
ollama['name'] = 'ollama'
ollama['alias'] = 'ollama'
ollama['build_args'] = {
'CMAKE_CUDA_ARCHITECTURES': ';'.join([str(x) for x in CUDA_ARCHITECTURES]),
'JETPACK_VERSION': str(JETPACK_VERSION),
'OLLAMA_REPO': 'ollama/ollama',
'OLLAMA_BRANCH': 'main',
'GOLANG_VERSION': '1.22.1',
'CMAKE_VERSION': '3.22.1',
}

package = [ollama]
19 changes: 19 additions & 0 deletions packages/llm/ollama/docs.md
@@ -0,0 +1,19 @@

* ollama from https://github.com/ollama/ollama with CUDA enabled (found under `/bin/ollama`)

# Container Usage

Run the container as a daemon in the background:

`docker run -d --gpus=all -v ollama:/root/.ollama -p 11434:11434 --name ollama dusty-nv/ollama`

Start the Ollama front-end with your desired model (for example, Mistral 7B):

`docker run -it --rm dusty-nv/ollama /bin/ollama run mistral`
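
Once the server is running, its REST API is reachable on the published port (11434 by default). A minimal non-streaming request to the `/api/generate` endpoint looks like this, assuming the `mistral` model has already been pulled:

```bash
# prompt the server from the host (pull the model first, e.g. `docker exec ollama ollama pull mistral`)
curl http://localhost:11434/api/generate -d '{
  "model": "mistral",
  "prompt": "Why is the sky blue?",
  "stream": false
}'
```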

### Memory Usage

| Model | Quantization | Memory (MB) |
|---------------------------------------------------------------------------------|:-----------------------------:|:-----------:|
| [`TheBloke/Llama-2-7B-GGUF`](https://huggingface.co/TheBloke/Llama-2-7B-GGUF) | `llama-2-7b.Q4_K_S.gguf` | 5,268 |
| [`TheBloke/Llama-2-13B-GGUF`](https://huggingface.co/TheBloke/Llama-2-13B-GGUF) | `llama-2-13b.Q4_K_S.gguf` | 8,609 |
| [`TheBloke/LLaMA-30b-GGUF`](https://huggingface.co/TheBloke/LLaMA-30b-GGUF) | `llama-30b.Q4_K_S.gguf` | 19,045 |
| [`TheBloke/Llama-2-70B-GGUF`](https://huggingface.co/TheBloke/Llama-2-70B-GGUF) | `llama-2-70b.Q4_K_S.gguf` | 37,655 |
28 changes: 28 additions & 0 deletions packages/llm/ollama/ollama_deps.sh
@@ -0,0 +1,28 @@
#!/bin/bash

set -ex

INSTALL_ARCH=$(uname -m)
if [ -z "${INSTALL_ARCH}" ]; then
echo "no architecture detected"
exit 1
fi

apt update && DEBIAN_FRONTEND=noninteractive apt install -y --no-install-recommends \
ca-certificates \
git \
gcc-10 \
g++-10

if [ -n "${CMAKE_VERSION}" ]; then
curl -s -L https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-${INSTALL_ARCH}.tar.gz | tar -zx -C /usr --strip-components 1
fi

if [ -n "${GOLANG_VERSION}" ]; then
GO_ARCH="arm64"
curl -s -L https://dl.google.com/go/go${GOLANG_VERSION}.linux-${GO_ARCH}.tar.gz | tar xz -C /usr/local
ln -s /usr/local/go/bin/go /usr/local/bin/go
ln -s /usr/local/go/bin/gofmt /usr/local/bin/gofmt
fi

rm -rf /var/lib/apt/lists/* && apt-get clean
4 changes: 4 additions & 0 deletions packages/llm/ollama/test.sh
@@ -0,0 +1,4 @@
#!/usr/bin/env bash

set -ex
file /bin/ollama