Skip to content

Commit 4dd7d12

Browse files
committed
Checking out how to clean up our docker story to be easier to use
1 parent 1ce2a6f commit 4dd7d12

File tree

3 files changed

+64
-7
lines changed

3 files changed

+64
-7
lines changed

.github/workflows/main.yml

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,7 @@ jobs:
211211
dist/*
212212
213213
docker-build:
214-
name: Build and Push Docker Image
214+
name: Build and Push Docker Images
215215
runs-on: large-olmocr-runner
216216
needs: [release]
217217
if: startsWith(github.ref, 'refs/tags/')
@@ -246,8 +246,8 @@ jobs:
246246
username: ${{ secrets.DOCKER_USERNAME }}
247247
password: ${{ secrets.DOCKER_PASSWORD }}
248248

249-
- name: Extract metadata
250-
id: meta
249+
- name: Extract metadata for base image
250+
id: meta-base
251251
uses: docker/metadata-action@v5
252252
with:
253253
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
@@ -258,18 +258,42 @@ jobs:
258258
flavor: |
259259
latest=true
260260
261-
- name: Build and push Docker image
261+
- name: Build and push base Docker image
262262
uses: docker/build-push-action@v5
263263
with:
264264
context: .
265265
file: ./Dockerfile
266266
push: true
267-
tags: ${{ steps.meta.outputs.tags }}
268-
labels: ${{ steps.meta.outputs.labels }}
267+
tags: ${{ steps.meta-base.outputs.tags }}
268+
labels: ${{ steps.meta-base.outputs.labels }}
269269
platforms: linux/amd64
270270
outputs: type=registry
271271
no-cache: true
272272

273+
- name: Extract metadata for image with model
274+
id: meta-with-model
275+
uses: docker/metadata-action@v5
276+
with:
277+
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
278+
tags: |
279+
type=ref,event=tag,suffix=-with-model
280+
type=semver,pattern={{version}}-with-model
281+
type=semver,pattern={{major}}.{{minor}}-with-model
282+
flavor: |
283+
latest=auto
284+
suffix=-with-model,onlatest=true
285+
286+
- name: Build and push Docker image with model
287+
uses: docker/build-push-action@v5
288+
with:
289+
context: .
290+
file: ./Dockerfile.with-model
291+
push: true
292+
tags: ${{ steps.meta-with-model.outputs.tags }}
293+
labels: ${{ steps.meta-with-model.outputs.labels }}
294+
platforms: linux/amd64
295+
outputs: type=registry
296+
273297
# jakep: push to beaker can't work because of limited disk space on these runners
274298
# jakep: (you can try by setting load: true above, but you'll need a larger runner)
275299
# - name: Setup Beaker CLI

Dockerfile

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,4 +53,7 @@ RUN uv pip install --system --no-cache ".[bench]"
5353
RUN playwright install-deps
5454
RUN playwright install chromium
5555

56-
RUN python3 -m olmocr.pipeline --help
56+
RUN python3 -m olmocr.pipeline --help
57+
58+
# Override the vLLM base image's entrypoint to allow interactive bash access
59+
ENTRYPOINT ["/bin/bash"]

Dockerfile.with-model

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# Build from the base olmocr image
2+
FROM alleninstituteforai/olmocr:latest
3+
4+
# Allow specifying which model to include at build time
5+
# Default to the latest olmOCR model
6+
ARG MODEL_NAME=allenai/olmOCR-2-7B-1025-FP8
7+
8+
# Set model cache directory to a fixed location in the image
9+
ENV HF_HOME=/opt/models
10+
ENV TRANSFORMERS_CACHE=/opt/models
11+
ENV OLMOCR_MODEL=${MODEL_NAME}
12+
13+
# Pre-download the olmOCR model into the image
14+
# This adds ~16GB to the image size but eliminates runtime downloads
15+
RUN python -c "from huggingface_hub import snapshot_download; \
16+
import os; \
17+
model = os.environ.get('OLMOCR_MODEL'); \
18+
print(f'Downloading model: {model}'); \
19+
snapshot_download(model, cache_dir='/opt/models/hub'); \
20+
print(f'Model {model} successfully downloaded and cached in image')"
21+
22+
# Verify the model is present
23+
RUN python -c "import os; \
24+
model = os.environ.get('OLMOCR_MODEL').replace('/', '--'); \
25+
model_path = f'/opt/models/hub/models--{model}'; \
26+
assert os.path.exists(model_path), f'Model not found at {model_path}'; \
27+
size = sum(os.path.getsize(os.path.join(dp, f)) for dp, dn, fn in os.walk(model_path) for f in fn) / (1024**3); \
28+
print(f'Model size: {size:.2f} GB')"
29+
30+
# The entrypoint is already set to /bin/bash in the base image

0 commit comments

Comments (0)