#!/usr/bin/env bash
# Runner disk cleanup script for Kindred Create CI/CD
#
# Designed to run as a cron job on the pubworker host:
#   */30 * * * * /path/to/cleanup.sh >> /var/log/runner-cleanup.log 2>&1
#
# Or install the systemd timer (see cleanup.timer / cleanup.service).
#
# What it cleans:
#   1. Docker: stopped containers, dangling images, build cache
#   2. act_runner action cache: files older than CACHE_MAX_AGE_DAYS
#   3. act_runner workspaces: top-level directories idle for too long
#   4. System: apt cache, oversized logs, old journal entries
#
# What it preserves (outside of the emergency phase):
#   - Running containers and the images they use
#   - Cache entries newer than CACHE_MAX_AGE_DAYS (so ccache hits still work)
#   - Everything outside of the paths listed above
#
# Environment overrides:
#   CLEANUP_THRESHOLD            aggressive-cleanup threshold in percent (85)
#   CLEANUP_EMERGENCY_THRESHOLD  emergency-cleanup threshold in percent (95)
#   CACHE_DIR                    act_runner cache directory
#   CACHE_MAX_AGE_DAYS           max age of cache files in days (7)
#   DOCKER_IMAGE_MAX_AGE         "until" duration for build cache pruning (48h)
#   WORKSPACE_MAX_IDLE_MIN       idle minutes before a workspace is removed (120)

set -euo pipefail

# ---------------------------------------------------------------------------
# Configuration -- adjust these to match your runner setup
# ---------------------------------------------------------------------------

# Disk usage (percent) at or above which aggressive cleanup runs
THRESHOLD=${CLEANUP_THRESHOLD:-85}

# Disk usage (percent) above which the emergency phase runs after cleanup
EMERGENCY_THRESHOLD=${CLEANUP_EMERGENCY_THRESHOLD:-95}

# act_runner cache directory (default location)
CACHE_DIR=${CACHE_DIR:-/root/.cache/actcache}

# act_runner workspace directories
WORKSPACES=(
  "/root/.cache/act"
  "/workspace"
)

# Maximum age (days) for cache entries before unconditional deletion
CACHE_MAX_AGE_DAYS=${CACHE_MAX_AGE_DAYS:-7}

# Duration passed to `docker builder prune --filter until=...` (e.g. 48h).
# The variable name is historical; it is only applied to the build cache.
DOCKER_IMAGE_MAX_AGE=${DOCKER_IMAGE_MAX_AGE:-48h}

# Top-level workspace directories not modified for this many minutes are removed
WORKSPACE_MAX_IDLE_MIN=${WORKSPACE_MAX_IDLE_MIN:-120}

# Log files larger than this (bytes, 100 MiB) are cut down to LOG_KEEP_LINES
readonly LOG_MAX_BYTES=104857600
readonly LOG_KEEP_LINES=1000

# Log prefix
LOG_PREFIX="[runner-cleanup]"

# Set by main(): "true" when disk usage is at or above THRESHOLD
AGGRESSIVE=false

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

# Print a timestamped, prefixed message to stdout.
log() {
  printf '%s %s %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "${LOG_PREFIX}" "$*"
}

# Print the usage of / as a bare integer percentage.
# Falls back to 0 (with a warning on stderr) when df yields nothing usable,
# so callers can always do integer comparisons on the result.
disk_usage_pct() {
  local pct
  pct=$(df --output=pcent / 2>/dev/null | tail -n 1 | tr -dc '0-9') || pct=""
  if [[ -z "$pct" ]]; then
    log "WARNING: could not determine disk usage, assuming 0%" >&2
    pct=0
  fi
  printf '%s\n' "$pct"
}

# Print $1 (a byte count) in human-readable IEC units, or "<n>B" if numfmt
# is unavailable or rejects the value.
bytes_to_human() {
  numfmt --to=iec-i --suffix=B -- "$1" 2>/dev/null || printf '%sB\n' "$1"
}

# Print the apparent size of directory $1 in bytes; prints 0 when it cannot
# be measured. du may exit non-zero if files vanish mid-scan; its total is
# still used in that case instead of aborting the script via pipefail.
dir_size_bytes() {
  local size
  size=$(du -sb -- "$1" 2>/dev/null | cut -f1) || true
  [[ "$size" =~ ^[0-9]+$ ]] || size=0
  printf '%s\n' "$size"
}

# ---------------------------------------------------------------------------
# Phase 2: Docker cleanup (always runs, safe)
# ---------------------------------------------------------------------------

# Remove every image that is not the image of a running container.
remove_unused_images() {
  local running_images image_id

  # If the running set cannot be determined, do nothing rather than treat
  # every image as unused.
  if ! running_images=$(docker ps -q 2>/dev/null \
    | xargs -r docker inspect --format='{{.Image}}' 2>/dev/null \
    | sort -u); then
    log "Could not list images of running containers, skipping image removal"
    return 0
  fi

  # Compare full IDs (--no-trunc) as whole lines. The here-string avoids the
  # `echo | grep -q` pipeline, whose status under pipefail can be non-zero
  # even when grep matched.
  while IFS= read -r image_id; do
    [[ -n "$image_id" ]] || continue
    if ! grep -qxF -- "$image_id" <<<"$running_images"; then
      docker rmi -f "$image_id" </dev/null 2>/dev/null || true
    fi
  done < <(docker images -q --no-trunc 2>/dev/null | sort -u)

  log "Removed unused Docker images (aggressive)"
}

# Remove stopped containers, dangling images and old build cache; when
# AGGRESSIVE is true also remove unused images and volumes.
docker_cleanup() {
  log "--- Docker cleanup ---"

  # A missing docker binary must not abort the remaining phases.
  if ! command -v docker >/dev/null 2>&1; then
    log "docker not found, skipping Docker cleanup"
    return 0
  fi

  local -a stopped_ids=() dangling_ids=()

  # Remove stopped containers
  mapfile -t stopped_ids < <(
    docker ps -aq --filter status=exited --filter status=dead 2>/dev/null
  )
  if ((${#stopped_ids[@]} > 0)); then
    docker rm "${stopped_ids[@]}" 2>/dev/null || true
    log "Removed ${#stopped_ids[@]} stopped containers"
  fi

  # Remove dangling images (untagged layers)
  mapfile -t dangling_ids < <(
    docker images -q --filter dangling=true 2>/dev/null
  )
  if ((${#dangling_ids[@]} > 0)); then
    docker rmi "${dangling_ids[@]}" 2>/dev/null || true
    log "Removed ${#dangling_ids[@]} dangling images"
  fi

  # Prune build cache
  docker builder prune -f --filter "until=${DOCKER_IMAGE_MAX_AGE}" \
    2>/dev/null || true
  log "Pruned Docker build cache older than ${DOCKER_IMAGE_MAX_AGE}"

  if [[ "$AGGRESSIVE" == true ]]; then
    remove_unused_images

    # Prune volumes
    docker volume prune -f 2>/dev/null || true
    log "Pruned unused Docker volumes"
  fi
}

# ---------------------------------------------------------------------------
# Phase 3: act_runner action cache cleanup
# ---------------------------------------------------------------------------

# Delete cache files older than CACHE_MAX_AGE_DAYS and report the space freed.
# If CACHE_DIR does not exist, the first existing alternative location is
# adopted (CACHE_DIR is updated so the emergency phase sees it too).
action_cache_cleanup() {
  log "--- Action cache cleanup ---"

  local alt before after freed

  if [[ ! -d "$CACHE_DIR" ]]; then
    log "Cache directory not found: ${CACHE_DIR}"
    # Try common alternative locations
    for alt in /var/lib/act_runner/.cache/actcache /home/*/.cache/actcache; do
      if [[ -d "$alt" ]]; then
        log "Found cache at: $alt (update CACHE_DIR config)"
        CACHE_DIR="$alt"
        break
      fi
    done
  fi

  [[ -d "$CACHE_DIR" ]] || return 0

  before=$(dir_size_bytes "$CACHE_DIR")

  # Delete cache entries older than max age
  find "$CACHE_DIR" -type f -mtime "+${CACHE_MAX_AGE_DAYS}" -delete \
    2>/dev/null || true
  # -mindepth 1 keeps the cache root itself even when it ends up empty.
  find "$CACHE_DIR" -mindepth 1 -type d -empty -delete 2>/dev/null || true

  after=$(dir_size_bytes "$CACHE_DIR")
  freed=$((before - after))
  log "Cache cleanup freed $(bytes_to_human "$freed") (entries older than ${CACHE_MAX_AGE_DAYS}d)"
}

# ---------------------------------------------------------------------------
# Phase 4: Workspace cleanup
# ---------------------------------------------------------------------------

# Remove top-level workspace directories whose mtime is older than
# WORKSPACE_MAX_IDLE_MIN minutes.
# NOTE(review): a directory's mtime only changes when its direct entries
# change, so a long build that writes only into nested directories may look
# idle here -- confirm this matches how act_runner lays out workspaces.
workspace_cleanup() {
  log "--- Workspace cleanup ---"

  local ws before after freed
  for ws in "${WORKSPACES[@]}"; do
    [[ -d "$ws" ]] || continue

    before=$(dir_size_bytes "$ws")
    find "$ws" -mindepth 1 -maxdepth 1 -type d \
      -mmin "+${WORKSPACE_MAX_IDLE_MIN}" \
      -exec rm -rf -- {} + 2>/dev/null || true
    after=$(dir_size_bytes "$ws")

    freed=$((before - after))
    if ((freed > 0)); then
      log "Workspace $ws: freed $(bytes_to_human "$freed")"
    fi
  done
}

# ---------------------------------------------------------------------------
# Phase 5: System cleanup
# ---------------------------------------------------------------------------

# Clean the apt cache, cut oversized log files down to their last
# LOG_KEEP_LINES lines, and vacuum journal entries older than 3 days.
system_cleanup() {
  log "--- System cleanup ---"

  # apt cache
  apt-get clean 2>/dev/null || true

  # Truncate large log files (keep last LOG_KEEP_LINES lines)
  local logfile size tail_buf
  for logfile in /var/log/syslog /var/log/daemon.log /var/log/kern.log; do
    [[ -f "$logfile" ]] || continue
    size=$(stat -c%s -- "$logfile" 2>/dev/null) || continue
    ((size > LOG_MAX_BYTES)) || continue

    tail_buf=$(mktemp) || continue
    # Rewrite the file in place instead of mv-ing a replacement over it:
    # the inode, owner and mode stay the same, and a process that still has
    # the file open keeps writing to the file that is actually on disk.
    if tail -n "$LOG_KEEP_LINES" -- "$logfile" >"$tail_buf" \
      && cat -- "$tail_buf" >"$logfile"; then
      log "Truncated $logfile (was >100MB)"
    fi
    rm -f -- "$tail_buf"
  done

  # Journal logs older than 3 days
  journalctl --vacuum-time=3d 2>/dev/null || true
}

# ---------------------------------------------------------------------------
# Phase 6: Emergency cleanup (only if still critical)
# ---------------------------------------------------------------------------

# If disk usage is still above EMERGENCY_THRESHOLD, prune all unused Docker
# data and wipe the action cache.
emergency_cleanup() {
  local usage
  usage=$(disk_usage_pct)
  ((usage > EMERGENCY_THRESHOLD)) || return 0

  log "CRITICAL: Still at ${usage}% after cleanup"

  # Nuclear option: remove ALL docker data except running containers
  docker system prune -af --volumes 2>/dev/null || true
  log "Ran docker system prune -af --volumes"

  # Clear entire action cache
  if [[ -d "$CACHE_DIR" ]]; then
    # ${CACHE_DIR:?} aborts instead of expanding to "/*" if the variable is
    # ever empty.
    rm -rf -- "${CACHE_DIR:?}"/*
    log "Cleared entire action cache"
  fi

  usage=$(disk_usage_pct)
  log "After emergency cleanup: ${usage}%"
}

# ---------------------------------------------------------------------------
# Summary
# ---------------------------------------------------------------------------

# Log final disk usage and the largest directories under /var.
summary() {
  local usage line
  usage=$(disk_usage_pct)
  log "Cleanup complete. Disk usage: ${usage}%"

  # Report top space consumers for diagnostics. This is informational only,
  # so a du warning or an early-closing head must not fail the whole run.
  log "Top 10 directories under /var:"
  { du -sh /var/*/ 2>/dev/null || true; } \
    | sort -rh \
    | head -n 10 \
    | while read -r line; do
      log " $line"
    done || true
}

# ---------------------------------------------------------------------------
# Phase 1: Check if cleanup is needed, then run all phases in order
# ---------------------------------------------------------------------------

main() {
  local usage
  usage=$(disk_usage_pct)
  log "Disk usage: ${usage}% (threshold: ${THRESHOLD}%)"

  if ((usage < THRESHOLD)); then
    log "Below threshold, running light cleanup only"
    AGGRESSIVE=false
  else
    log "Above threshold, running aggressive cleanup"
    AGGRESSIVE=true
  fi

  docker_cleanup
  action_cache_cleanup
  workspace_cleanup
  system_cleanup
  emergency_cleanup
  summary
}

main "$@"