From 8ba7b73aa83436f6a821eb4456c2bb32bc9e2414 Mon Sep 17 00:00:00 2001
From: forbes
Date: Sat, 7 Feb 2026 08:18:00 -0600
Subject: [PATCH] fix(ci): fix ccache strategy and add runner cleanup

- Cache key: replace run_id (unique per build) with date-based key so
  entries are reused within the same day and rotate daily
- Skip cache save on exact hit (act_runner can't overwrite keys)
- build.sh: set CCACHE_DIR/CCACHE_BASEDIR inside rattler-build's isolated
  env so release builds actually use the cached directory
- build.sh: print ccache stats at end for diagnostics
- Add disk space cleanup step to build.yml (matching release.yml)
- Remove cross-build cache fallback in release.yml (different -O flags)
- Add runner cleanup daemon (.gitea/runner/) with systemd timer to purge
  stale cache entries, Docker data, and old workspaces
---
Review notes (this section is ignored by git am):
- cleanup.sh: skip the Docker phases when the daemon is unavailable, keep
  du/docker pipeline failures from aborting the run under pipefail, judge
  workspace staleness by the newest path inside it instead of the
  top-level directory mtime, and truncate logs in place.
- Leading whitespace was reconstructed from a whitespace-collapsed copy of
  this patch; confirm context-line indentation against the tree before
  applying.
- Blob ids on the "index" lines are those of the original revision.

 .gitea/runner/cleanup.service  |  11 ++
 .gitea/runner/cleanup.sh       | 264 +++++++++++++++++++++++++++++++++
 .gitea/runner/cleanup.timer    |  10 ++
 .gitea/workflows/build.yml     |  22 +++-
 .gitea/workflows/release.yml   |  14 ++-
 package/rattler-build/build.sh |  15 +++
 6 files changed, 329 insertions(+), 7 deletions(-)
 create mode 100644 .gitea/runner/cleanup.service
 create mode 100755 .gitea/runner/cleanup.sh
 create mode 100644 .gitea/runner/cleanup.timer

diff --git a/.gitea/runner/cleanup.service b/.gitea/runner/cleanup.service
new file mode 100644
index 0000000000..4fdd523100
--- /dev/null
+++ b/.gitea/runner/cleanup.service
@@ -0,0 +1,11 @@
+[Unit]
+Description=Kindred Create CI runner disk cleanup
+After=docker.service
+
+[Service]
+Type=oneshot
+ExecStart=/opt/runner/cleanup.sh
+Environment=CLEANUP_THRESHOLD=85
+Environment=CACHE_MAX_AGE_DAYS=7
+StandardOutput=append:/var/log/runner-cleanup.log
+StandardError=append:/var/log/runner-cleanup.log
diff --git a/.gitea/runner/cleanup.sh b/.gitea/runner/cleanup.sh
new file mode 100755
index 0000000000..23c42c7365
--- /dev/null
+++ b/.gitea/runner/cleanup.sh
@@ -0,0 +1,264 @@
+#!/usr/bin/env bash
+# Runner disk cleanup script for Kindred Create CI/CD
+#
+# Designed to run as a cron job on the pubworker host:
+#   */30 * * * * /path/to/cleanup.sh >> /var/log/runner-cleanup.log 2>&1
+#
+# Or install the systemd timer (see cleanup.timer / cleanup.service).
+#
+# What it cleans:
+#   1. Docker: stopped containers, dangling images, build cache
+#   2. act_runner action cache: files older than CACHE_MAX_AGE_DAYS
+#   3. act_runner workspaces: directories idle for WORKSPACE_MAX_IDLE_MIN
+#   4. System: apt cache, old logs
+#
+# What it preserves:
+#   - Running containers and the images they use
+#   - Cache files newer than CACHE_MAX_AGE_DAYS (unless usage stays above 95%)
+#   - Everything outside of known CI paths
+#
+# Cleanup steps are best-effort: a missing tool or a failing command is
+# skipped so that the remaining phases still run.
+
+set -euo pipefail
+
+# ---------------------------------------------------------------------------
+# Configuration -- adjust these to match your runner setup
+# ---------------------------------------------------------------------------
+
+# Disk usage threshold (percent) -- only run aggressive cleanup above this
+THRESHOLD=${CLEANUP_THRESHOLD:-85}
+
+# act_runner cache directory (default location)
+CACHE_DIR=${CACHE_DIR:-/root/.cache/actcache}
+
+# act_runner workspace directories
+WORKSPACES=(
+    "/root/.cache/act"
+    "/workspace"
+)
+
+# Maximum age (days) for cache entries before unconditional deletion
+CACHE_MAX_AGE_DAYS=${CACHE_MAX_AGE_DAYS:-7}
+
+# Age (a Docker duration such as 48h) beyond which build cache is pruned
+DOCKER_IMAGE_MAX_AGE=${DOCKER_IMAGE_MAX_AGE:-48h}
+
+# Minutes without any change inside a workspace before it counts as stale
+WORKSPACE_MAX_IDLE_MIN=${WORKSPACE_MAX_IDLE_MIN:-120}
+
+# Log prefix
+LOG_PREFIX="[runner-cleanup]"
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+log() { echo "$(date '+%Y-%m-%d %H:%M:%S') ${LOG_PREFIX} $*"; }
+
+disk_usage_pct() {
+    df --output=pcent / | tail -n 1 | tr -dc '0-9'
+}
+
+bytes_to_human() {
+    numfmt --to=iec-i --suffix=B "$1" 2>/dev/null || echo "${1}B"
+}
+
+# Print the size of directory $1 in bytes, or 0 if it cannot be measured.
+# du may fail when files vanish mid-scan; that must not abort the script.
+dir_bytes() {
+    local size
+    size=$(du -sb "$1" 2>/dev/null | cut -f1) || true
+    echo "${size:-0}"
+}
+
+# ---------------------------------------------------------------------------
+# Phase 1: Check if cleanup is needed
+# ---------------------------------------------------------------------------
+
+usage=$(disk_usage_pct)
+log "Disk usage: ${usage}% (threshold: ${THRESHOLD}%)"
+
+if [ "$usage" -lt "$THRESHOLD" ]; then
+    log "Below threshold, running light cleanup only"
+    AGGRESSIVE=false
+else
+    log "Above threshold, running aggressive cleanup"
+    AGGRESSIVE=true
+fi
+
+# ---------------------------------------------------------------------------
+# Phase 2: Docker cleanup (always runs, safe)
+# ---------------------------------------------------------------------------
+
+log "--- Docker cleanup ---"
+
+# Without this guard a missing CLI or stopped daemon would make the pipelines
+# below fail and, under "set -e -o pipefail", abort every later phase.
+if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then
+    HAVE_DOCKER=true
+else
+    HAVE_DOCKER=false
+    log "Docker not available, skipping Docker cleanup"
+fi
+
+if [ "$HAVE_DOCKER" = true ]; then
+    # Remove stopped containers
+    mapfile -t stopped_ids < <(docker ps -aq --filter status=exited --filter status=dead 2>/dev/null)
+    if [ "${#stopped_ids[@]}" -gt 0 ]; then
+        docker rm "${stopped_ids[@]}" 2>/dev/null || true
+        log "Removed ${#stopped_ids[@]} stopped containers"
+    fi
+
+    # Remove dangling images (untagged layers)
+    mapfile -t dangling_ids < <(docker images -q --filter dangling=true 2>/dev/null)
+    if [ "${#dangling_ids[@]}" -gt 0 ]; then
+        docker rmi "${dangling_ids[@]}" 2>/dev/null || true
+        log "Removed ${#dangling_ids[@]} dangling images"
+    fi
+
+    # Prune build cache
+    docker builder prune -f --filter "until=${DOCKER_IMAGE_MAX_AGE}" 2>/dev/null || true
+    log "Pruned Docker build cache older than ${DOCKER_IMAGE_MAX_AGE}"
+
+    if [ "$AGGRESSIVE" = true ]; then
+        # Remove all images not used by running containers. Both lists hold
+        # full "sha256:..." IDs so membership is an exact-line match.
+        running_images=$(docker ps -q 2>/dev/null |
+            xargs -r docker inspect --format='{{.Image}}' 2>/dev/null | sort -u) || true
+        all_images=$(docker images -q --no-trunc 2>/dev/null | sort -u) || true
+        for img in $all_images; do
+            if ! grep -qxF -- "$img" <<<"$running_images"; then
+                docker rmi -f "$img" 2>/dev/null || true
+            fi
+        done
+        log "Removed unused Docker images (aggressive)"
+
+        # Prune volumes
+        docker volume prune -f 2>/dev/null || true
+        log "Pruned unused Docker volumes"
+    fi
+fi
+
+# ---------------------------------------------------------------------------
+# Phase 3: act_runner action cache cleanup
+# ---------------------------------------------------------------------------
+
+log "--- Action cache cleanup ---"
+
+if [ ! -d "$CACHE_DIR" ]; then
+    log "Cache directory not found: ${CACHE_DIR}"
+
+    # Try common alternative locations
+    for alt in /var/lib/act_runner/.cache/actcache /home/*/.cache/actcache; do
+        if [ -d "$alt" ]; then
+            log "Found cache at: $alt (update CACHE_DIR config)"
+            CACHE_DIR="$alt"
+            break
+        fi
+    done
+fi
+
+if [ -d "$CACHE_DIR" ]; then
+    before=$(dir_bytes "$CACHE_DIR")
+
+    # Delete cache entries older than max age
+    find "$CACHE_DIR" -type f -mtime "+${CACHE_MAX_AGE_DAYS}" -delete 2>/dev/null || true
+    find "$CACHE_DIR" -type d -empty -delete 2>/dev/null || true
+
+    after=$(dir_bytes "$CACHE_DIR")
+    freed=$((before - after))
+    log "Cache cleanup freed $(bytes_to_human "$freed") (entries older than ${CACHE_MAX_AGE_DAYS}d)"
+fi
+
+# ---------------------------------------------------------------------------
+# Phase 4: Workspace cleanup
+# ---------------------------------------------------------------------------
+
+log "--- Workspace cleanup ---"
+
+for ws in "${WORKSPACES[@]}"; do
+    if [ -d "$ws" ]; then
+        before=$(dir_bytes "$ws")
+        # A directory's own mtime changes only when entries are added or removed
+        # directly inside it, so a build writing deeper in the tree would look
+        # stale. Remove a workspace only when nothing beneath it was modified
+        # within the idle window.
+        while IFS= read -r -d '' dir; do
+            recent=$(find "$dir" -mmin "-${WORKSPACE_MAX_IDLE_MIN}" -print -quit 2>/dev/null) || true
+            if [ -z "$recent" ]; then
+                rm -rf -- "$dir" 2>/dev/null || true
+            fi
+        done < <(find "$ws" -mindepth 1 -maxdepth 1 -type d -print0 2>/dev/null)
+        after=$(dir_bytes "$ws")
+        freed=$((before - after))
+        if [ "$freed" -gt 0 ]; then
+            log "Workspace $ws: freed $(bytes_to_human "$freed")"
+        fi
+    fi
+done
+
+# ---------------------------------------------------------------------------
+# Phase 5: System cleanup
+# ---------------------------------------------------------------------------
+
+log "--- System cleanup ---"
+
+# apt cache
+apt-get clean 2>/dev/null || true
+
+# Truncate large log files (keep last 1000 lines). The file is rewritten in
+# place instead of being replaced with mv: a daemon holding it open would
+# otherwise keep writing to the unlinked original and no space would be freed.
+for logfile in /var/log/syslog /var/log/daemon.log /var/log/kern.log; do
+    [ -f "$logfile" ] || continue
+    size=$(stat -c%s "$logfile" 2>/dev/null) || size=0
+    if [ "${size:-0}" -gt 104857600 ]; then
+        tmp=$(mktemp) || continue
+        if tail -n 1000 "$logfile" > "$tmp" && cat "$tmp" > "$logfile"; then
+            log "Truncated $logfile (was >100MB)"
+        fi
+        rm -f -- "$tmp"
+    fi
+done
+
+# Journal logs older than 3 days
+journalctl --vacuum-time=3d 2>/dev/null || true
+
+# ---------------------------------------------------------------------------
+# Phase 6: Emergency cleanup (only if still critical)
+# ---------------------------------------------------------------------------
+
+usage=$(disk_usage_pct)
+if [ "$usage" -gt 95 ]; then
+    log "CRITICAL: Still at ${usage}% after cleanup"
+
+    # Nuclear option: remove all Docker data not used by a running container
+    if [ "$HAVE_DOCKER" = true ]; then
+        docker system prune -af --volumes 2>/dev/null || true
+        log "Ran docker system prune -af --volumes"
+    fi
+
+    # Clear entire action cache
+    if [ -d "$CACHE_DIR" ]; then
+        rm -rf "${CACHE_DIR:?}/"* || true
+        log "Cleared entire action cache"
+    fi
+
+    usage=$(disk_usage_pct)
+    log "After emergency cleanup: ${usage}%"
+fi
+
+# ---------------------------------------------------------------------------
+# Summary
+# ---------------------------------------------------------------------------
+
+usage=$(disk_usage_pct)
+log "Cleanup complete. Disk usage: ${usage}%"
+
+# Report top space consumers for diagnostics. Informational only: a du error
+# or a closed pipe must not turn a successful run into a failed unit.
+log "Top 10 directories under /var:"
+du -sh /var/*/ 2>/dev/null | sort -rh | head -n 10 | while read -r line; do
+    log "  $line"
+done || true
diff --git a/.gitea/runner/cleanup.timer b/.gitea/runner/cleanup.timer
new file mode 100644
index 0000000000..9f646d8d84
--- /dev/null
+++ b/.gitea/runner/cleanup.timer
@@ -0,0 +1,10 @@
+[Unit]
+Description=Run CI runner cleanup every 30 minutes
+
+[Timer]
+OnBootSec=5min
+OnUnitActiveSec=30min
+Persistent=true
+
+[Install]
+WantedBy=timers.target
diff --git a/.gitea/workflows/build.yml b/.gitea/workflows/build.yml
index f2d0cd0963..2a13fb4765 100644
--- a/.gitea/workflows/build.yml
+++ b/.gitea/workflows/build.yml
@@ -22,6 +22,17 @@ jobs:
       DEBIAN_FRONTEND: noninteractive
 
     steps:
+      - name: Free disk space
+        run: |
+          echo "=== Disk usage before cleanup ==="
+          df -h /
+          rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache 2>/dev/null || true
+          rm -rf /usr/local/share/boost /usr/share/swift 2>/dev/null || true
+          apt-get autoremove -y 2>/dev/null || true
+          apt-get clean 2>/dev/null || true
+          echo "=== Disk usage after cleanup ==="
+          df -h /
+
       - name: Install system prerequisites
         run: |
           apt-get update -qq
@@ -46,12 +57,16 @@ jobs:
           export PATH="$HOME/.pixi/bin:$PATH"
           pixi --version
 
+      - name: Compute cache date key
+        id: cache-date
+        run: echo "date=$(date -u +%Y%m%d)" >> "$GITHUB_OUTPUT"
+
       - name: Restore ccache
         id: ccache-restore
         uses: https://github.com/actions/cache/restore@v4
         with:
           path: /tmp/ccache-kindred-create
-          key: ccache-build-${{ github.ref_name }}-${{ github.run_id }}
+          key: ccache-build-${{ github.ref_name }}-${{ steps.cache-date.outputs.date }}
           restore-keys: |
             ccache-build-${{ github.ref_name }}-
             ccache-build-main-
@@ -60,6 +75,7 @@ jobs:
         run: |
           mkdir -p $CCACHE_DIR
           pixi run ccache -z
+          pixi run ccache -p
 
       - name: Configure (CMake)
         run: pixi run cmake --preset conda-linux-release
@@ -71,11 +87,11 @@ jobs:
         run: pixi run ccache -s
 
       - name: Save ccache
-        if: always()
+        if: always() && steps.ccache-restore.outputs.cache-hit != 'true'
         uses: https://github.com/actions/cache/save@v4
         with:
           path: /tmp/ccache-kindred-create
-          key: ccache-build-${{ github.ref_name }}-${{ github.run_id }}
+          key: ccache-build-${{ github.ref_name }}-${{ steps.cache-date.outputs.date }}
 
       - name: Run C++ unit tests
         continue-on-error: true
diff --git a/.gitea/workflows/release.yml b/.gitea/workflows/release.yml
index 61fef9e681..50db6e8c0d 100644
--- a/.gitea/workflows/release.yml
+++ b/.gitea/workflows/release.yml
@@ -67,20 +67,26 @@ jobs:
           export PATH="$HOME/.pixi/bin:$PATH"
           pixi --version
 
+      - name: Compute cache date key
+        id: cache-date
+        run: echo "date=$(date -u +%Y%m%d)" >> "$GITHUB_OUTPUT"
+
       - name: Restore ccache
         id: ccache-restore
         uses: https://github.com/actions/cache/restore@v4
         with:
           path: /tmp/ccache-kindred-create
-          key: ccache-release-linux-${{ github.run_id }}
+          key: ccache-release-linux-${{ steps.cache-date.outputs.date }}
           restore-keys: |
             ccache-release-linux-
-            ccache-build-main-
 
       - name: Prepare ccache
         run: |
           mkdir -p $CCACHE_DIR
+          # Expose pixi's PATH (ccache). NOTE(review): step-local -- later steps won't inherit it
+          export PATH="$(pixi run bash -c 'echo $PATH')"
           pixi run ccache -z
+          pixi run ccache -p
 
       - name: Build release package (AppImage)
         working-directory: package/rattler-build
@@ -92,11 +98,11 @@ jobs:
         run: pixi run ccache -s
 
       - name: Save ccache
-        if: always()
+        if: always() && steps.ccache-restore.outputs.cache-hit != 'true'
         uses: https://github.com/actions/cache/save@v4
         with:
           path: /tmp/ccache-kindred-create
-          key: ccache-release-linux-${{ github.run_id }}
+          key: ccache-release-linux-${{ steps.cache-date.outputs.date }}
 
       - name: Clean up intermediate build files
         run: |
diff --git a/package/rattler-build/build.sh b/package/rattler-build/build.sh
index 7183630d70..e143c98acf 100644
--- a/package/rattler-build/build.sh
+++ b/package/rattler-build/build.sh
@@ -1,3 +1,15 @@
+# Configure ccache to use a shared cache directory that persists across CI runs.
+# The workflow caches /tmp/ccache-kindred-create between builds.
+export CCACHE_DIR="${CCACHE_DIR:-/tmp/ccache-kindred-create}"
+export CCACHE_BASEDIR="${SRC_DIR:-$(pwd)}"
+export CCACHE_COMPRESS="${CCACHE_COMPRESS:-true}"
+export CCACHE_COMPRESSLEVEL="${CCACHE_COMPRESSLEVEL:-6}"
+export CCACHE_MAXSIZE="${CCACHE_MAXSIZE:-4G}"
+export CCACHE_SLOPPINESS="${CCACHE_SLOPPINESS:-include_file_ctime,include_file_mtime,pch_defines,time_macros}"
+mkdir -p "$CCACHE_DIR"
+echo "ccache config: CCACHE_DIR=$CCACHE_DIR CCACHE_BASEDIR=$CCACHE_BASEDIR"
+ccache -z || true
+
 if [[ ${HOST} =~ .*linux.* ]]; then
     CMAKE_PRESET=conda-linux-release
 fi
@@ -46,3 +58,6 @@ cmake --install build
 
 mv ${PREFIX}/bin/FreeCAD ${PREFIX}/bin/freecad || true
 mv ${PREFIX}/bin/FreeCADCmd ${PREFIX}/bin/freecadcmd || true
+
+echo "=== ccache statistics ==="
+ccache -s || true