Skip to content

Commit 4dd7d12

Browse files
committed
Checking out how to clean up our docker story to be easier to use
1 parent 1ce2a6f commit 4dd7d12

File tree

3 files changed

+64
-7
lines changed

3 files changed

+64
-7
lines changed

.github/workflows/main.yml

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,7 @@ jobs:
211211
dist/*
212212
213213
docker-build:
214-
name: Build and Push Docker Image
214+
name: Build and Push Docker Images
215215
runs-on: large-olmocr-runner
216216
needs: [release]
217217
if: startsWith(github.ref, 'refs/tags/')
@@ -246,8 +246,8 @@ jobs:
246246
username: ${{ secrets.DOCKER_USERNAME }}
247247
password: ${{ secrets.DOCKER_PASSWORD }}
248248

249-
- name: Extract metadata
250-
id: meta
249+
- name: Extract metadata for base image
250+
id: meta-base
251251
uses: docker/metadata-action@v5
252252
with:
253253
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
@@ -258,18 +258,42 @@ jobs:
258258
flavor: |
259259
latest=true
260260
261-
- name: Build and push Docker image
261+
- name: Build and push base Docker image
262262
uses: docker/build-push-action@v5
263263
with:
264264
context: .
265265
file: ./Dockerfile
266266
push: true
267-
tags: ${{ steps.meta.outputs.tags }}
268-
labels: ${{ steps.meta.outputs.labels }}
267+
tags: ${{ steps.meta-base.outputs.tags }}
268+
labels: ${{ steps.meta-base.outputs.labels }}
269269
platforms: linux/amd64
270270
outputs: type=registry
271271
no-cache: true
272272

273+
- name: Extract metadata for image with model
274+
id: meta-with-model
275+
uses: docker/metadata-action@v5
276+
with:
277+
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
278+
tags: |
279+
type=ref,event=tag,suffix=-with-model
280+
type=semver,pattern={{version}}-with-model
281+
type=semver,pattern={{major}}.{{minor}}-with-model
282+
flavor: |
283+
latest=auto
284+
suffix=-with-model,onlatest=true
285+
286+
- name: Build and push Docker image with model
287+
uses: docker/build-push-action@v5
288+
with:
289+
context: .
290+
file: ./Dockerfile.with-model
291+
push: true
292+
tags: ${{ steps.meta-with-model.outputs.tags }}
293+
labels: ${{ steps.meta-with-model.outputs.labels }}
294+
platforms: linux/amd64
295+
outputs: type=registry
296+
273297
# jakep: push to beaker can't work because of limited disk space on these runners
274298
# jakep: (you can try by setting load: true above, but you'll need a larger runner)
275299
# - name: Setup Beaker CLI

Dockerfile

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,4 +53,7 @@ RUN uv pip install --system --no-cache ".[bench]"
5353
RUN playwright install-deps
5454
RUN playwright install chromium
5555

56-
RUN python3 -m olmocr.pipeline --help
56+
RUN python3 -m olmocr.pipeline --help
57+
58+
# Override the vLLM base image's entrypoint to allow interactive bash access
59+
ENTRYPOINT ["/bin/bash"]

Dockerfile.with-model

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# Build from the base olmocr image
2+
FROM alleninstituteforai/olmocr:latest
3+
4+
# Allow specifying which model to include at build time
5+
# Default to the latest olmOCR model
6+
ARG MODEL_NAME=allenai/olmOCR-2-7B-1025-FP8
7+
8+
# Set model cache directory to a fixed location in the image
9+
ENV HF_HOME=/opt/models
10+
ENV TRANSFORMERS_CACHE=/opt/models
11+
ENV OLMOCR_MODEL=${MODEL_NAME}
12+
13+
# Pre-download the olmOCR model into the image
14+
# This adds ~16GB to the image size but eliminates runtime downloads
15+
RUN python -c "from huggingface_hub import snapshot_download; \
16+
import os; \
17+
model = os.environ.get('OLMOCR_MODEL'); \
18+
print(f'Downloading model: {model}'); \
19+
snapshot_download(model, cache_dir='/opt/models/hub'); \
20+
print(f'Model {model} successfully downloaded and cached in image')"
21+
22+
# Verify the model is present
23+
RUN python -c "import os; \
24+
model = os.environ.get('OLMOCR_MODEL').replace('/', '--'); \
25+
model_path = f'/opt/models/hub/models--{model}'; \
26+
assert os.path.exists(model_path), f'Model not found at {model_path}'; \
27+
size = sum(os.path.getsize(os.path.join(dp, f)) for dp, dn, fn in os.walk(model_path) for f in fn) / (1024**3); \
28+
print(f'Model size: {size:.2f} GB')"
29+
30+
# The entrypoint is already set to /bin/bash in the base image

0 commit comments

Comments (0)