This guide covers common use cases for using Smash and provides examples for basic operations, advanced filtering, performance tuning, and more.
It is designed to help users effectively manage duplicate files across various scenarios.
- Basic Operations
- Advanced Filtering
- Performance Tuning
- Working with Large Files
- Report Analysis
- Common Use Cases
- Using Docker
- Troubleshooting
# Basic scan (current directory only)
smash
# Recursive scan
smash -r
# Scan specific directories
smash -r ~/Documents ~/Downloads

# Silent mode - no output except report
smash -r --silent -o report.json ~/data
# Verbose mode - detailed processing info
smash -r --verbose ~/data
# Disable progress indicators
smash -r --no-progress ~/data

# Auto-generated filename
smash -r ~/data
# Custom report filename
smash -r -o duplicates.json ~/data
# Disable report generation (console output only)
smash -r --no-output ~/data

# Only files larger than 1MB
smash -r --min-size=1048576 ~/data
# Only files smaller than 100MB
smash -r --max-size=104857600 ~/data
# Files between 1MB and 100MB
smash -r --min-size=1048576 --max-size=104857600 ~/data

# Exclude multiple directories
smash -r --exclude-dir=.git,.svn,node_modules ~/projects
# Exclude file patterns
smash -r --exclude-file="*.log,*.tmp,*.cache" ~/data
# Complex exclusions
smash -r \
--exclude-dir=.git,target,build \
--exclude-file="*.class,*.o,*.pyc" \
~/workspace

# Include hidden files
smash -r --ignore-hidden=false ~/data
# Include system files
smash -r --ignore-system=false ~/data
# Include empty files in analysis
smash -r --ignore-empty=false ~/data

# Low-resource system
smash -r --max-workers=4 --max-threads=4 ~/data
# High-performance system
smash -r --max-workers=32 --max-threads=32 ~/data
# I/O bound operations (network drives)
smash -r --max-workers=8 ~/network-share

# More slices for very large files
smash -r --slices=8 --slice-size=16384 ~/videos
# Disable slicing for small files only
smash -r --slice-threshold=1048576 ~/documents
# Full file hashing (no slicing)
smash -r --disable-slicing ~/critical-data

See Algorithm Guide for details on available algorithms.
# Fastest (default)
smash -r --algorithm=xxhash ~/data
# Balanced speed/distribution
smash -r --algorithm=murmur3 ~/data
# Cryptographic verification
smash -r --algorithm=sha256 ~/data

# Optimized for large video files
smash -r \
--slices=8 \
--slice-size=32768 \
--min-size=10485760 \
--exclude-file="*.srt,*.sub,*.idx" \
~/videos

# Optimized for photos with metadata
smash -r \
--disable-meta=false \
--exclude-file="*.xmp,Thumbs.db,.DS_Store" \
--min-size=10240 \
~/Photos

# Optimized for code repos
smash -r \
--exclude-dir=.git,node_modules,target,dist \
--exclude-file="*.log,*.lock,*.sum" \
--disable-autotext=false \
~/repos

# List all duplicate files
jq -r '.analysis.dupes[].files[].path' report.json
# Count duplicate sets
jq '.analysis.dupes | length' report.json
# Total space wasted
jq '.analysis.summary.spaceWasted' report.json

# Files larger than 100MB
jq '.analysis.dupes[] | select(.files[0].size > 104857600)' report.json
# Duplicates in specific directory
jq '.analysis.dupes[].files[] | select(.path | startswith("/home/user/Downloads"))' report.json
# Group by hash
jq -r '.analysis.dupes[] | "\(.files[0].hash): \(.files | map(.path) | join(", "))"' report.json

# List all empty files
jq -r '.analysis.empty[].path' report.json
# Count empty files by directory
jq -r '.analysis.empty[].path | split("/")[:-1] | join("/")' report.json | sort | uniq -c

# Generate rm commands for duplicates (keeping first)
jq -r '.analysis.dupes[].files[1:][].path | "rm -i \"\(.)\""' report.json > remove_dupes.sh
# Create hardlink script
jq -r '.analysis.dupes[] |
.files[0].path as $keep |
.files[1:][] |
"ln -f \"\($keep)\" \"\(.path)\""' report.json > create_links.sh

# Compare backup directories
smash -r --show-duplicates \
/backup/2024-01 \
/backup/2024-02 \
/backup/2024-03
# Find duplicates across local and external
smash -r ~/Documents /mnt/usb-backup/Documents

# Music library
smash -r \
--algorithm=murmur3 \
--exclude-file="*.jpg,*.png,*.txt,*.cue" \
--min-size=1048576 \
~/Music
# Photo library with RAW files
smash -r \
--slices=6 \
--exclude-file="*.xmp,*.pp3" \
~/Pictures

# Find duplicate dependencies
smash -r \
--exclude-dir=.git \
--show-top=50 \
~/go/pkg/mod \
~/.cargo/registry
# Clean build artifacts
smash -r \
--exclude-dir=.git,src \
--min-size=1048576 \
~/projects

Smash can be run in a Docker container, which is useful for consistent environments, CI/CD pipelines, or when you don't want to install the binary directly. The official Docker image is available on GitHub Container Registry.
Tip
Use the -t flag to allocate a pseudo-TTY for better output formatting with Docker.
Leave it out if you don't need TTY support (e.g., in scripts/pipelines).
We use the --rm flag to automatically remove the container after it exits, keeping
your environment clean in these examples.
# Pull the latest image
docker pull ghcr.io/thushan/smash:latest
# Basic scan of current directory
docker run -t --rm -v "$PWD:/data" ghcr.io/thushan/smash:latest -r /data
# Scan with JSON output
docker run -t --rm -v "$PWD:/data" ghcr.io/thushan/smash:latest -r -o /data/report.json /data

Docker containers are isolated from your host filesystem. You need to mount directories as volumes to scan them:
# Mount current directory as /data (read-only recommended for scanning)
docker run -t --rm -v "$PWD:/data:ro" ghcr.io/thushan/smash:latest -r /data
# Mount multiple directories
docker run -t --rm \
-v "$HOME/Documents:/docs:ro" \
-v "$HOME/Pictures:/pics:ro" \
ghcr.io/thushan/smash:latest -r /docs /pics
# Windows PowerShell syntax
docker run -t --rm -v "${PWD}:/data:ro" ghcr.io/thushan/smash:latest -r /data

When generating reports, the output file must be written to a mounted volume. The container includes a built-in /output directory that is already writable, but you need to mount it to a host directory to access the reports:
# Create a local output directory and mount it
mkdir -p ./output
docker run -t --rm \
-v "$PWD:/data:ro" \
-v "$PWD/output:/output" \
ghcr.io/thushan/smash:latest -r -o /output/report.json /data
# Now you can read the report on your host
cat ./output/report.json | jq '.analysis.summary'
# Use different output directories for different scans
mkdir -p ./reports/photos ./reports/documents
docker run -t --rm \
-v "$HOME/Pictures:/data:ro" \
-v "$PWD/reports/photos:/output" \
ghcr.io/thushan/smash:latest -r -o /output/duplicates.json /data
# Alternative: write directly to mounted data directory
docker run -t --rm \
-v "$PWD:/data" \
ghcr.io/thushan/smash:latest -r -o /data/report.json /data
# Note: This requires the data mount to NOT be read-only

Note
The /output directory inside the container is pre-configured with appropriate permissions for the non-root user.
When mounting to a host directory, ensure the host directory exists and is accessible.
docker run -t --rm -v "$PWD:/data:ro" ghcr.io/thushan/smash:latest \
-r --algorithm=murmur3 \
--exclude-dir=.git,node_modules \
--min-size=1048576 \
-o /data/large-files.json /data

# Limit resources for container
docker run -t --rm \
--cpus="2.0" \
--memory="1g" \
-v "$PWD:/data:ro" \
ghcr.io/thushan/smash:latest \
-r --max-workers=8 --max-threads=8 /data

# Use a specific version tag
docker pull ghcr.io/thushan/smash:v1.0.0
docker run -t --rm -v "$PWD:/data:ro" ghcr.io/thushan/smash:v1.0.0 -r /data

For CI/CD pipelines, Docker provides a consistent environment:
# GitHub Actions example
- name: Find duplicates
run: |
docker run --rm \
-v ${{ github.workspace }}:/workspace:ro \
-v ${{ github.workspace }}/reports:/output \
ghcr.io/thushan/smash:latest \
-r --silent -o /output/duplicates.json /workspace

# GitLab CI example
find-duplicates:
image: ghcr.io/thushan/smash:latest
script:
- smash -r --silent -o duplicates.json .
artifacts:
paths:
- duplicates.json

For complex setups, you can use Docker Compose:
# docker-compose.yml
version: '3.8'
services:
smash:
image: ghcr.io/thushan/smash:latest
volumes:
- ./data:/data:ro
- ./reports:/output
command: -r --silent -o /output/report.json /data

Run with: docker-compose run --rm smash
- Use `-t` for interactive runs - This ensures proper output formatting and colours, but can be omitted in scripts/pipelines.
- Use `--rm` to auto-cleanup - Prevents accumulation of stopped containers
- Mount as read-only (`:ro`) - Smash only reads files, so use `:ro` for safety
- Create output directories first - Ensure output directories exist and are writable
- Use specific versions in production - Pin to specific tags rather than `latest`
No coloured output
# Always use -t flag
docker run -t --rm -v "$PWD:/data" ghcr.io/thushan/smash:latest -r /data

Permission denied errors
# Ensure output directory is writable
chmod 755 ./reports
docker run -t --rm -v "$PWD:/data:ro" -v "$PWD/reports:/output" \
ghcr.io/thushan/smash:latest -r -o /output/report.json /data

Can't find files
# Check your volume mounts - paths inside container are different
docker run -t --rm -v "$HOME/Documents:/docs" ghcr.io/thushan/smash:latest ls /docs

Slow scanning on network drives
# Reduce workers and increase slice size
smash -r --max-workers=4 --slice-size=65536 /mnt/nas

High memory usage
# Reduce concurrent operations
smash -r --max-workers=8 --show-top=10 ~/data

Too many open files error
# Reduce workers or increase system limits
ulimit -n 4096
smash -r --max-workers=8 ~/data

False positives
# Use more slices and full file hashing for critical data
smash -r --slices=16 --disable-slicing ~/important

Missing duplicates
# Check exclusion rules and size limits
smash -r \
--ignore-hidden=false \
--ignore-system=false \
--min-size=0 \
~/data

Report too large
# Limit output to significant duplicates
smash -r --min-size=10485760 --show-top=100 ~/data

Can't parse report
# Validate JSON
jq . report.json > /dev/null || echo "Invalid JSON"
# Pretty print for reading
jq . report.json > report-formatted.json

Smash respects standard environment variables:
# Disable colors in output
NO_COLOR=1 smash -r ~/data
# Force colors in non-TTY environment
FORCE_COLOR=1 smash -r ~/data | tee scan.log

#!/bin/bash
# /etc/cron.daily/smash-scan
REPORT_DIR="/var/reports/smash"
mkdir -p "$REPORT_DIR"
/usr/local/bin/smash -r --silent \
-o "$REPORT_DIR/scan-$(date +%Y%m%d).json" \
/data

#!/bin/bash
# .git/hooks/pre-commit
smash --no-output --exclude-dir=.git . || {
echo "Duplicate files detected!"
exit 1
}

#!/bin/bash
# monitor-duplicates.sh
THRESHOLD=1073741824 # 1GB
REPORT=$(mktemp)
smash -r --silent -o "$REPORT" "$1"
WASTE=$(jq '.analysis.summary.spaceWasted // 0' "$REPORT")
if [ "$WASTE" -gt "$THRESHOLD" ]; then
echo "Warning: $WASTE bytes wasted in duplicates"
jq -r '.analysis.dupes[] | "\(.files | length) copies: \(.files[0].path)"' "$REPORT"
fi
rm -f "$REPORT"