#!/usr/bin/env bash
#
# Atomicorp Container SBOM Collector
# Copyright (C) 2026 Atomicorp
#
# Collects inventory or SBOM data for running Docker containers by image ID,
# with local caching and status events for unchanged/removed images.
#

# Strict mode: exit on an unhandled command failure, on expansion of an unset
# variable, and when any stage of a pipeline fails.
set -euo pipefail

# =========================
# Version
# =========================
# Printed by --version and stamped into emitted events, the state file and
# every cache record (a cache record from another script version is not reused).
SCRIPT_VERSION="2.1.0"

# =========================
# Defaults
# =========================
# Each value below can be overridden by the matching command-line option,
# except ALLOW_IMAGE_ID_PREFIXES, which has no option.

# Docker Unix socket; validation only accepts the two well-known paths.
SOCK="/var/run/docker.sock"

OUTPUT_MODE="inventory"              # inventory|sbom
SBOM_FORMAT="cyclonedx-json@1.6"     # used when --mode sbom
EMIT_META="no"                       # yes|no (sbom mode only)

LOG_LEVEL="info"                     # debug|info|warn|error
LOG_FILE=""                          # empty = log to stderr only

# Scheduling priority applied to each Syft run. ionice is only used when
# IONICE_CLASS is non-empty and the ionice command is available.
NICE_LEVEL="10"                      # 0..19
IONICE_CLASS=""                      # 2(best-effort) or 3(idle)
IONICE_LEVEL="7"                     # 0..7

# Durations handed to timeout(1) for Docker API calls and Syft runs.
DOCKER_HTTP_TIMEOUT="15"
SYFT_TIMEOUT="600"

# Locked with flock -n so only one collector runs at a time.
LOCK_FILE="/tmp/container-sbom.lock"

# Safety limits; exceeding a per-run limit aborts the run, exceeding the
# per-image Syft size limit marks that image as failed.
MAX_IMAGES_PER_RUN="100"
MAX_CONTAINERS_PER_RUN="500"
MAX_SYFT_JSON_BYTES="52428800"       # 50 MiB
MAX_STDOUT_BYTES="104857600"         # 100 MiB

# When "yes", the run ends with an error if any image failed or was skipped.
FAIL_ON_PARTIAL="no"                 # yes|no

# Per-image results are cached under CACHE_DIR/images; CACHE_DIR/state.json
# records the images seen by the previous run.
CACHE_DIR="/var/awp/cache/container-sbom"
CACHE_TTL_SECONDS="0"                # 0 = disabled
RESCAN="no"                          # yes|no
STATUS_ONLY_ON_UNCHANGED="yes"       # yes|no

# Comma-separated list of prefixes an image ID must start with to be scanned.
ALLOW_IMAGE_ID_PREFIXES="docker-pullable://,sha256:"

# =========================
# Usage
# =========================
#######################################
# Print the command-line help text.
# Arguments: none
# Outputs:   the help text on stdout; callers reporting a usage error
#            redirect it to stderr themselves.
# Note:      the here-document delimiter is quoted, so the text is emitted
#            literally with no variable or command expansion.
#######################################
usage() {
  cat <<'EOF'
Usage:
  container-sbom-collect scan docker://running [options]
  container-sbom-collect --version

Commands:
  scan                    Run a scan

Targets:
  docker://running        Enumerate running Docker containers, de-duplicate by image ID,
                          and scan each unique image once

Options:
  --mode MODE             inventory | sbom
                          Default: inventory

  --sbom-format FORMAT    cyclonedx-json@1.6 | spdx-json@2.3 | syft-json |
                          cyclonedx-json | cyclonedx-xml | spdx-json | spdx-tag-value
                          Default: cyclonedx-json@1.6

  --emit-meta YESNO       yes | no
                          Emit metadata before SBOM output in sbom mode
                          Default: no

  --docker-sock PATH      Docker Unix socket path
                          Allowed: /var/run/docker.sock or /run/docker.sock
                          Default: /var/run/docker.sock

  --log-level LEVEL       debug | info | warn | error
                          Default: info

  --log-file PATH         Optional log file path

  --nice-level N          0..19
                          Default: 10

  --ionice-class N        2 | 3
                          Optional I/O priority class

  --ionice-level N        0..7
                          Default: 7

  --docker-timeout SEC    Timeout for Docker API calls
                          Default: 15

  --syft-timeout SEC      Timeout per Syft run
                          Default: 600

  --lock-file PATH        Lock file path
                          Default: /tmp/container-sbom.lock

  --max-images N          Max unique images per run
                          Default: 100

  --max-containers N      Max running containers per run
                          Default: 500

  --max-syft-json-bytes N Max allowed Syft output size per image
                          Default: 52428800

  --max-stdout-bytes N    Max total bytes emitted by this run
                          Default: 104857600

  --fail-on-partial YESNO yes | no
                          Default: no

  --cache-dir PATH        Cache directory
                          Default: /var/awp/cache/container-sbom

  --cache-ttl-seconds N   0 disables TTL invalidation
                          Default: 0

  --rescan YESNO          Force rescan and bypass cache
                          Default: no

  --status-only-on-unchanged YESNO
                          Emit only status/presence for unchanged cached images
                          Default: yes

  --version               Show script version and exit
  --help                  Show this help
EOF
}

# =========================
# Interface
# =========================
# The first two positional arguments are the command and its target; everything
# after them is treated as option/value pairs.
CMD="${1:-}"
TARGET="${2:-}"

case "$CMD" in
  --version)
    echo "$SCRIPT_VERSION"
    exit 0
    ;;
  ""|--help|-h)
    # No command at all is treated like an explicit request for help.
    usage
    exit 0
    ;;
  scan)
    ;;
  *)
    echo "ERROR: Unsupported command '$CMD' (expected: scan)" >&2
    exit 64
    ;;
esac

if [[ -z "$TARGET" ]]; then
  echo "ERROR: Missing target" >&2
  usage >&2
  exit 64
fi

# Drop the command and target so only options remain.
shift 2

# =========================
# Parse CLI options
# =========================
#######################################
# Parse "--option value" pairs into the corresponding configuration globals.
# Globals:   writes OUTPUT_MODE, SBOM_FORMAT, EMIT_META, SOCK, LOG_LEVEL,
#            LOG_FILE, NICE_LEVEL, IONICE_CLASS, IONICE_LEVEL,
#            DOCKER_HTTP_TIMEOUT, SYFT_TIMEOUT, LOCK_FILE, MAX_IMAGES_PER_RUN,
#            MAX_CONTAINERS_PER_RUN, MAX_SYFT_JSON_BYTES, MAX_STDOUT_BYTES,
#            FAIL_ON_PARTIAL, CACHE_DIR, CACHE_TTL_SECONDS, RESCAN,
#            STATUS_ONLY_ON_UNCHANGED; reads SCRIPT_VERSION
# Arguments: the option words remaining after "scan <target>"
# Outputs:   --version / --help print to stdout; errors go to stderr
# Returns:   0 when all options were consumed. Exits 0 for --version/--help
#            and 64 for an unknown option or an option without a value.
#######################################
parse_cli_options() {
  local var
  while (( $# > 0 )); do
    # Map the option name to the global it sets; flag-style options exit here.
    case "$1" in
      --mode) var=OUTPUT_MODE ;;
      --sbom-format) var=SBOM_FORMAT ;;
      --emit-meta) var=EMIT_META ;;
      --docker-sock) var=SOCK ;;
      --log-level) var=LOG_LEVEL ;;
      --log-file) var=LOG_FILE ;;
      --nice-level) var=NICE_LEVEL ;;
      --ionice-class) var=IONICE_CLASS ;;
      --ionice-level) var=IONICE_LEVEL ;;
      --docker-timeout) var=DOCKER_HTTP_TIMEOUT ;;
      --syft-timeout) var=SYFT_TIMEOUT ;;
      --lock-file) var=LOCK_FILE ;;
      --max-images) var=MAX_IMAGES_PER_RUN ;;
      --max-containers) var=MAX_CONTAINERS_PER_RUN ;;
      --max-syft-json-bytes) var=MAX_SYFT_JSON_BYTES ;;
      --max-stdout-bytes) var=MAX_STDOUT_BYTES ;;
      --fail-on-partial) var=FAIL_ON_PARTIAL ;;
      --cache-dir) var=CACHE_DIR ;;
      --cache-ttl-seconds) var=CACHE_TTL_SECONDS ;;
      --rescan) var=RESCAN ;;
      --status-only-on-unchanged) var=STATUS_ONLY_ON_UNCHANGED ;;
      --version) echo "$SCRIPT_VERSION"; exit 0 ;;
      --help|-h) usage; exit 0 ;;
      *)
        echo "ERROR: Unknown option '$1'" >&2
        usage >&2
        exit 64
        ;;
    esac

    # Without this check "shift 2" fails with one argument left and, under
    # set -e, the script would exit with status 1 and no explanation.
    if (( $# < 2 )); then
      echo "ERROR: Option '$1' requires a value" >&2
      exit 64
    fi

    printf -v "$var" '%s' "$2"
    shift 2
  done
}

parse_cli_options "$@"
# All options have been consumed; leave no positional parameters behind.
set --

# =========================
# Validation
# =========================
#######################################
# Report a fatal error and terminate the script.
# Arguments: message words, joined with single spaces
# Outputs:   "ERROR: <message>" on stderr
# Returns:   never returns; exits with status 1
#######################################
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

# Enumerated options: reject anything outside the documented value sets.
case "$TARGET" in docker://running) : ;; *) die "Unsupported target '$TARGET'";; esac
case "$OUTPUT_MODE" in inventory|sbom) : ;; *) die "Invalid --mode=$OUTPUT_MODE";; esac
case "$EMIT_META" in yes|no) : ;; *) die "Invalid --emit-meta=$EMIT_META";; esac
case "$FAIL_ON_PARTIAL" in yes|no) : ;; *) die "Invalid --fail-on-partial=$FAIL_ON_PARTIAL";; esac
case "$RESCAN" in yes|no) : ;; *) die "Invalid --rescan=$RESCAN";; esac
case "$STATUS_ONLY_ON_UNCHANGED" in yes|no) : ;; *) die "Invalid --status-only-on-unchanged=$STATUS_ONLY_ON_UNCHANGED";; esac
case "$LOG_LEVEL" in debug|info|warn|error) : ;; *) die "Invalid --log-level=$LOG_LEVEL";; esac

# Numeric options are accepted as plain decimal digit strings (at most 18
# digits, so they always fit a signed 64-bit integer) and then rewritten in
# canonical form with an explicit 10# radix. Without that, a leading zero makes
# later (( )) comparisons read the value as octal ("010" -> 8) or fail outright
# ("09"), which would silently weaken or disable a limit.
[[ "$NICE_LEVEL" =~ ^[0-9]{1,18}$ ]] || die "Invalid --nice-level=$NICE_LEVEL"
NICE_LEVEL=$(( 10#$NICE_LEVEL ))
(( NICE_LEVEL >= 0 && NICE_LEVEL <= 19 )) || die "Invalid --nice-level range (0..19)"

# The ionice level is only validated when an ionice class was requested.
if [[ -n "$IONICE_CLASS" ]]; then
  [[ "$IONICE_CLASS" =~ ^[0-9]+$ ]] || die "Invalid --ionice-class=$IONICE_CLASS"
  case "$IONICE_CLASS" in
    2|3) : ;;
    *) die "Invalid --ionice-class (allowed 2 or 3)" ;;
  esac
  [[ "$IONICE_LEVEL" =~ ^[0-9]{1,18}$ ]] || die "Invalid --ionice-level=$IONICE_LEVEL"
  IONICE_LEVEL=$(( 10#$IONICE_LEVEL ))
  (( IONICE_LEVEL >= 0 && IONICE_LEVEL <= 7 )) || die "Invalid --ionice-level range (0..7)"
fi

for n in DOCKER_HTTP_TIMEOUT SYFT_TIMEOUT MAX_IMAGES_PER_RUN MAX_CONTAINERS_PER_RUN MAX_SYFT_JSON_BYTES MAX_STDOUT_BYTES CACHE_TTL_SECONDS; do
  v="${!n}"
  [[ "$v" =~ ^[0-9]{1,18}$ ]] || die "Invalid numeric option: $n=$v"
  # Store the canonical base-10 form back into the named variable.
  printf -v "$n" '%d' "$(( 10#$v ))"
done

# Only the two well-known Docker socket locations are accepted.
case "$SOCK" in
  /var/run/docker.sock|/run/docker.sock) : ;;
  *) die "Refusing --docker-sock=$SOCK (allowed: /var/run/docker.sock or /run/docker.sock)" ;;
esac

case "$SBOM_FORMAT" in
  cyclonedx-json|cyclonedx-xml|spdx-json|spdx-tag-value|syft-json|cyclonedx-json@1.6|spdx-json@2.3) : ;;
  *) die "Invalid --sbom-format=$SBOM_FORMAT" ;;
esac

# =========================
# Logging
# =========================
#######################################
# Print the current UTC time as an ISO-8601 timestamp (YYYY-MM-DDTHH:MM:SSZ).
# Outputs: the timestamp followed by a newline on stdout
#######################################
ts() {
  date -u '+%Y-%m-%dT%H:%M:%SZ'
}

#######################################
# Decide whether a message of the given severity passes the LOG_LEVEL filter.
# Globals:   LOG_LEVEL (read)
# Arguments: $1 - severity of the message (debug|info|warn|error)
# Returns:   0 if the message should be logged, non-zero otherwise. A
#            LOG_LEVEL outside the four known values lets everything through.
#######################################
_should_log() {
  local requested="$1"
  if [[ "$LOG_LEVEL" == "debug" ]]; then
    return 0
  elif [[ "$LOG_LEVEL" == "info" ]]; then
    [[ "$requested" != "debug" ]]
  elif [[ "$LOG_LEVEL" == "warn" ]]; then
    [[ "$requested" == "warn" || "$requested" == "error" ]]
  elif [[ "$LOG_LEVEL" == "error" ]]; then
    [[ "$requested" == "error" ]]
  fi
}

#######################################
# Write a timestamped log line if its severity passes the LOG_LEVEL filter.
# Globals:   LOG_FILE (read)
# Arguments: $1 - severity; remaining words form the message
# Outputs:   "[<timestamp>] [<severity>] <message>" on stderr and, when
#            LOG_FILE is set, appended to that file
# Returns:   0 always; a failure to write the log file is ignored on purpose
#######################################
log() {
  local level="$1"
  shift
  if ! _should_log "$level"; then
    return 0
  fi
  local line="[$(ts)] [$level] $*"
  printf '%s\n' "$line" >&2
  # File logging is best-effort: an unwritable file must not stop the run.
  [[ -z "$LOG_FILE" ]] || printf '%s\n' "$line" >> "$LOG_FILE" 2>/dev/null || true
}

#######################################
# ERR trap handler: log the failing command and exit with its status.
# Outputs: one error-level log line naming the exit status, the line number
#          taken from BASH_LINENO[0] and the command in BASH_COMMAND
# Returns: never returns; exits with the status of the failed command
#######################################
on_err() {
  # $? must be captured first, before any other command overwrites it.
  local status=$?
  log error "Collector failed (exit=$status) at line ${BASH_LINENO[0]}: ${BASH_COMMAND}"
  exit "$status"
}

trap 'on_err' ERR

# =========================
# Requirements
# =========================
#######################################
# Abort the script unless the named command can be found.
# Arguments: $1 - command name
# Returns:   0 if the command exists; otherwise die() ends the script
#######################################
require_cmd() {
  if ! command -v "$1" >/dev/null 2>&1; then
    die "Missing required command: $1"
  fi
}

# Verify every external tool up front, in a fixed order, so a missing
# dependency is reported before any work starts.
for _required_tool in \
  curl jq syft timeout flock wc mktemp \
  sha256sum stat mkdir rm mv cat head; do
  require_cmd "$_required_tool"
done
unset _required_tool

# =========================
# Lock
# =========================
# Keep the lock file open on fd 9 for the rest of the process and take a
# non-blocking exclusive lock on it. If another instance already holds the
# lock this run ends quietly with status 0 instead of queueing behind it.
if ! exec 9>"$LOCK_FILE"; then
  die "Unable to open lock file: $LOCK_FILE"
fi

flock -n 9 || {
  log warn "Another collector instance is running; exiting"
  exit 0
}

# =========================
# Globals / Paths
# =========================
# Run counters (integers): bytes written to stdout so far and per-image tallies.
declare -i emitted_bytes=0 skipped_images=0 failed_images=0
declare -i processed_images=0 cache_hits=0 cache_misses=0

# Cache layout: one JSON record per image plus a state file describing the
# images seen by the most recent run.
CACHE_IMAGES_DIR="$CACHE_DIR/images"
STATE_FILE="$CACHE_DIR/state.json"
mkdir -p "$CACHE_IMAGES_DIR"

# Private scratch directory for this run, removed on every exit path.
TMPDIR_RUN="$(mktemp -d /tmp/container-sbom.XXXXXX)"

cleanup() {
  rm -rf "$TMPDIR_RUN"
}

trap cleanup EXIT

# =========================
# Helpers
# =========================
#######################################
# Write a file to stdout as one record, enforcing the per-run output budget.
# Globals:   emitted_bytes (read/written), MAX_STDOUT_BYTES (read)
# Arguments: $1 - path of the file to emit
# Outputs:   the file contents followed by a newline on stdout
# Returns:   0 on success; calls die() (before writing anything) when the
#            record would push the total past MAX_STDOUT_BYTES
#######################################
safe_emit_file() {
  local file="$1"
  local bytes
  local -i total
  bytes=$(wc -c < "$file" | tr -d ' ')
  # The +1 accounts for the newline appended below, so the counter matches
  # the number of bytes that actually reach stdout.
  total=$(( emitted_bytes + bytes + 1 ))
  if (( total > MAX_STDOUT_BYTES )); then
    die "Refusing to emit more than --max-stdout-bytes=$MAX_STDOUT_BYTES"
  fi
  cat "$file"
  printf '\n'
  emitted_bytes=$total
}

#######################################
# Write a string to stdout as one record, enforcing the per-run output budget.
# Globals:   emitted_bytes (read/written), MAX_STDOUT_BYTES (read)
# Arguments: $1 - payload to emit
# Outputs:   the payload followed by a newline on stdout
# Returns:   0 on success; calls die() (before writing anything) when the
#            record would push the total past MAX_STDOUT_BYTES
#######################################
safe_emit_string() {
  local payload="$1"
  local bytes
  local -i total
  # wc -c measures bytes, which may differ from the character count.
  bytes=$(printf '%s' "$payload" | wc -c | tr -d ' ')
  # The +1 accounts for the newline appended below, so the counter matches
  # the number of bytes that actually reach stdout.
  total=$(( emitted_bytes + bytes + 1 ))
  if (( total > MAX_STDOUT_BYTES )); then
    die "Refusing to emit more than --max-stdout-bytes=$MAX_STDOUT_BYTES"
  fi
  printf '%s\n' "$payload"
  emitted_bytes=$total
}

#######################################
# Issue a GET request against the Docker API over the Unix socket.
# Globals:   SOCK, DOCKER_HTTP_TIMEOUT (read)
# Arguments: $1 - API path beginning with "/" (e.g. /containers/json)
# Outputs:   the response body on stdout
# Returns:   status of timeout/curl; curl --fail makes HTTP errors non-zero
#######################################
docker_get() {
  local api_path="$1"
  local url="http://localhost${api_path}"
  local -a request=(curl -sS --fail --unix-socket "$SOCK" "$url")
  timeout "$DOCKER_HTTP_TIMEOUT" "${request[@]}"
}

#######################################
# Run Syft under a timeout and reduced scheduling priority, capturing stdout.
# Globals:   SYFT_TIMEOUT, NICE_LEVEL, IONICE_CLASS, IONICE_LEVEL (read)
# Arguments: $1 - file that receives Syft's stdout; remaining arguments are
#            passed to "syft -q" unchanged
# Returns:   exit status of the timeout/ionice/nice/syft command chain
#######################################
run_syft_to_file() {
  local outfile="$1"
  shift

  local -a cmd=(timeout "$SYFT_TIMEOUT")
  # ionice is optional: used only when a class was requested and it exists.
  if [[ -n "$IONICE_CLASS" ]] && command -v ionice >/dev/null 2>&1; then
    cmd+=(ionice -c "$IONICE_CLASS" -n "$IONICE_LEVEL")
  fi
  cmd+=(nice -n "$NICE_LEVEL" syft -q "$@")

  # stdin comes from /dev/null so the child can never consume input that
  # belongs to the caller, such as the file feeding a "while read" loop.
  "${cmd[@]}" < /dev/null > "$outfile"
}

#######################################
# Reduce a syft-json document to a compact JSON array of package records.
# Each record carries name, version, type, purl, cpes, licenses and
# locations (path and layerID); a missing .artifacts yields an empty array.
# Arguments: $1 - syft-json input file
#            $2 - output file for the package array
# Returns:   exit status of jq
#######################################
syft_json_to_inventory_packages_file() {
  local syft_doc="$1"
  local packages_out="$2"
  local filter
  filter='
    (.artifacts // [])
    | map({
        name: .name,
        version: .version,
        type: (.type // .packageType // null),
        purl: (.purl // null),
        cpes: (.cpes // []),
        licenses: ((.licenses // []) | map(.value? // .)),
        locations: ((.locations // []) | map({
          path: (.path // null),
          layerID: (.layerID // null)
        }))
      })
  '
  jq -c "$filter" "$syft_doc" > "$packages_out"
}

#######################################
# Check whether an image ID starts with one of the allowed prefixes.
# Globals:   ALLOW_IMAGE_ID_PREFIXES (read) - comma-separated prefix list
# Arguments: $1 - image ID
# Returns:   0 if the ID begins with an allowed prefix, 1 otherwise
#######################################
is_allowed_image_id() {
  local imageid="$1"
  local prefix
  # Declared local so the array does not leak into the global scope.
  local -a prefixes=()
  IFS=',' read -r -a prefixes <<< "$ALLOW_IMAGE_ID_PREFIXES"
  # The ${arr[@]+...} form keeps an empty list safe under "set -u" on older bash.
  for prefix in ${prefixes[@]+"${prefixes[@]}"}; do
    # An empty entry (e.g. from ",sha256:") would match every ID; ignore it.
    [[ -n "$prefix" ]] || continue
    [[ "$imageid" == "$prefix"* ]] && return 0
  done
  return 1
}

#######################################
# Print an image ID with a leading "docker-pullable://" removed; any other
# ID (including "sha256:..." IDs) is printed unchanged.
# Arguments: $1 - image ID
# Outputs:   the normalised ID on stdout, without a trailing newline
#######################################
normalize_image_id() {
  # Stripping a prefix that is not present leaves the value untouched, so a
  # single expansion covers every case.
  printf '%s' "${1#docker-pullable://}"
}

#######################################
# Turn an image ID into a string usable as a single file-name component:
# a leading "docker-pullable://" is dropped and every "/" and ":" becomes "_".
# Arguments: $1 - image ID
# Outputs:   the sanitised name on stdout, without a trailing newline
#######################################
sanitize_image_id_for_filename() {
  local stripped="${1#docker-pullable://}"
  # Held in a variable so the "/" inside the bracket expression cannot be
  # mistaken for the delimiter of the substitution below.
  local unsafe='[/:]'
  printf '%s' "${stripped//$unsafe/_}"
}

#######################################
# Print the output of "syft --version" with carriage returns removed.
# Outputs: the version text on stdout; syft's stderr is discarded
#######################################
syft_version() {
  syft --version 2> /dev/null \
    | tr -d '\r'
}

#######################################
# Print a SHA-256 fingerprint of the options that change what is emitted, so
# cached results produced under different options are not reused.
# Globals:   OUTPUT_MODE, SBOM_FORMAT, EMIT_META (read)
# Outputs:   64 hexadecimal characters and a newline on stdout
# Returns:   0 on success, the pipeline's status if hashing fails
#######################################
output_affecting_options_fingerprint() {
  local s digest
  if [[ "$OUTPUT_MODE" == "inventory" ]]; then
    # Inventory output does not depend on the SBOM-only options.
    s="mode=inventory"
  else
    s="mode=sbom|sbom_format=${SBOM_FORMAT}|emit_meta=${EMIT_META}"
  fi
  digest="$(printf '%s' "$s" | sha256sum)" || return
  # sha256sum prints "<hash>  -"; keep only the hash. Parameter expansion is
  # used instead of awk, which the start-up dependency checks do not cover.
  printf '%s\n' "${digest%% *}"
}

#######################################
# Print the path of the cache record belonging to an image.
# Globals:   CACHE_IMAGES_DIR (read)
# Arguments: $1 - image ID
# Outputs:   "<CACHE_IMAGES_DIR>/<sanitised id>.json" without a newline
#######################################
cache_file_for_image() {
  local safe_name
  safe_name="$(sanitize_image_id_for_filename "$1")"
  printf '%s/%s.json' "$CACHE_IMAGES_DIR" "$safe_name"
}

#######################################
# Tell whether a cache file is older than the configured TTL.
# Globals:   CACHE_TTL_SECONDS (read); 0 disables expiry
# Arguments: $1 - cache file path
# Returns:   0 if the file's age exceeds the TTL, 1 otherwise. A file whose
#            modification time cannot be read is treated as having mtime 0.
#######################################
cache_is_expired() {
  local cache_file="$1"
  local current_epoch modified_epoch

  # TTL of zero means entries never expire.
  (( CACHE_TTL_SECONDS != 0 )) || return 1

  current_epoch="$(date +%s)"
  modified_epoch="$(stat -c %Y "$cache_file" 2>/dev/null || echo 0)"

  if (( current_epoch - modified_epoch > CACHE_TTL_SECONDS )); then
    return 0
  fi
  return 1
}

#######################################
# Decide whether a cache record may be reused for an image in this run.
# The record must exist, be within the TTL, and match the image ID, output
# mode, Syft version, script version and options fingerprint (plus the SBOM
# format when running in sbom mode).
# Globals:   OUTPUT_MODE, SBOM_FORMAT, SCRIPT_VERSION (read)
# Arguments: $1 - cache file path
#            $2 - image ID
#            $3 - options fingerprint of this run
#            $4 - Syft version of this run
# Returns:   0 if the record is reusable, non-zero otherwise (including when
#            the record cannot be parsed)
#######################################
cache_valid_for_image() {
  local cache_file="$1"
  local imageid="$2"
  local options_fp="$3"
  local syft_ver="$4"

  if [[ ! -f "$cache_file" ]]; then
    return 1
  fi
  if cache_is_expired "$cache_file"; then
    return 1
  fi

  local -a bindings=(
    --arg image_id "$imageid"
    --arg mode "$OUTPUT_MODE"
    --arg sbom_format "$SBOM_FORMAT"
    --arg syft_version "$syft_ver"
    --arg script_version "$SCRIPT_VERSION"
    --arg options_fingerprint "$options_fp"
  )

  # jq -e maps a false/null result to a non-zero exit status.
  jq -e "${bindings[@]}" '
      .image_id == $image_id and
      .mode == $mode and
      .syft_version == $syft_version and
      .script_version == $script_version and
      .options_fingerprint == $options_fingerprint and
      (
        ($mode == "inventory") or
        ($mode == "sbom" and .sbom_format == $sbom_format)
      )
    ' "$cache_file" >/dev/null 2>&1
}

#######################################
# Tell whether the run must end with an error because it was only partial.
# Globals: FAIL_ON_PARTIAL, failed_images, skipped_images (read)
# Returns: 0 when FAIL_ON_PARTIAL is "yes" and at least one image failed or
#          was skipped; 1 otherwise
#######################################
should_fail_run() {
  if [[ "$FAIL_ON_PARTIAL" != "yes" ]]; then
    return 1
  fi
  (( failed_images > 0 || skipped_images > 0 ))
}

#######################################
# Install a freshly built state file as STATE_FILE.
# The content is first copied to a temporary file next to STATE_FILE and then
# renamed over it, so the replacement happens within one directory and an
# interrupted run cannot leave a half-written state file behind. (A plain mv
# from the scratch directory may cross filesystems and degrade to copy+delete.)
# Globals:   STATE_FILE (read)
# Arguments: $1 - path of the new state file; removed on success
# Returns:   0 on success, non-zero if staging or renaming fails (the existing
#            STATE_FILE is then left untouched)
#######################################
write_state_file() {
  local current_state_file="$1"
  local staged="${STATE_FILE}.tmp.$$"

  if ! cat "$current_state_file" > "$staged"; then
    rm -f "$staged"
    return 1
  fi
  if ! mv -f "$staged" "$STATE_FILE"; then
    rm -f "$staged"
    return 1
  fi
  rm -f "$current_state_file"
}

#######################################
# Tell whether a state file from an earlier run is present.
# Globals: STATE_FILE (read)
# Returns: 0 if STATE_FILE is an existing regular file, non-zero otherwise
#######################################
previous_state_exists() {
  test -f "$STATE_FILE"
}

#######################################
# Emit a "removed" status event for every image that is listed in the
# previous state file but absent from the current one.
# Globals:   SCRIPT_VERSION (read); emitted_bytes is updated through
#            safe_emit_string
# Arguments: $1 - previous state file (nothing is emitted if it is missing)
#            $2 - current state file
# Outputs:   one JSON event per removed image on stdout
# Returns:   0 on success, jq's status if the comparison fails
#######################################
emit_removed_events() {
  local prev_state="$1"
  local curr_state="$2"
  local events line

  [[ -f "$prev_state" ]] || return 0

  # The events are collected first and emitted from a loop that runs in the
  # current shell. Piping jq straight into the loop would run it in a subshell,
  # and the byte accounting done by safe_emit_string would be thrown away.
  events="$(jq -n -c --slurpfile prev "$prev_state" --slurpfile curr "$curr_state" --arg script_version "$SCRIPT_VERSION" '
    ($prev[0].images // {}) as $p
    | ($curr[0].images // {}) as $c
    | ($p | keys[]) as $k
    | select(($c[$k] // null) == null)
    | {
        type: "container_inventory_status",
        runtime: "docker",
        status: "removed",
        image_id: $k,
        collected_at: (now | todateiso8601),
        scanner_version: $script_version
      }
  ')" || return

  [[ -n "$events" ]] || return 0

  while IFS= read -r line; do
    [[ -n "$line" ]] || continue
    safe_emit_string "$line"
  done <<< "$events"
}

#######################################
# Build and emit a container_inventory_status event for one image.
# Globals:   TMPDIR_RUN, SCRIPT_VERSION (read)
# Arguments: $1 - image ID
#            $2 - file holding the JSON array of containers using the image
#            $3 - status value to report (e.g. "unchanged")
# Outputs:   the event as one compact JSON record, via safe_emit_file
#######################################
emit_status_event() {
  local imageid="$1"
  local containers_file="$2"
  local status="$3"
  local event_file="$TMPDIR_RUN/status-$(sanitize_image_id_for_filename "$imageid").json"

  local -a jq_args=(
    -n -c
    --arg image_id "$imageid"
    --arg collected_at "$(ts)"
    --arg status "$status"
    --arg script_version "$SCRIPT_VERSION"
    --slurpfile containers "$containers_file"
  )

  jq "${jq_args[@]}" \
    '{
      type: "container_inventory_status",
      runtime: "docker",
      status: $status,
      image_id: $image_id,
      collected_at: $collected_at,
      scanner_version: $script_version,
      containers: $containers[0]
    }' > "$event_file"

  safe_emit_file "$event_file"
}

# =========================
# Main
# =========================
# Refuse to start unless the configured Docker path is an existing socket.
[[ -S "$SOCK" ]] || die "Docker socket not found at $SOCK"

# Both values are computed once per run; they are stored in the state file and
# in every cache record, and compared when deciding whether a cached result
# can be reused.
CURRENT_SYFT_VERSION="$(syft_version)"
OPTIONS_FP="$(output_affecting_options_fingerprint)"

log info "Starting container collector v${SCRIPT_VERSION} (mode=$OUTPUT_MODE target=$TARGET)"
# Full effective configuration, visible only at debug level. SYFT_PARALLELISM
# is not set anywhere in this script; it is reported from the environment if
# present.
log debug "SOCK=$SOCK OUTPUT_MODE=$OUTPUT_MODE SBOM_FORMAT=$SBOM_FORMAT EMIT_META=$EMIT_META NICE_LEVEL=$NICE_LEVEL IONICE_CLASS=${IONICE_CLASS:-<disabled>} IONICE_LEVEL=$IONICE_LEVEL SYFT_PARALLELISM=${SYFT_PARALLELISM:-<default>} DOCKER_HTTP_TIMEOUT=$DOCKER_HTTP_TIMEOUT SYFT_TIMEOUT=$SYFT_TIMEOUT MAX_IMAGES_PER_RUN=$MAX_IMAGES_PER_RUN MAX_CONTAINERS_PER_RUN=$MAX_CONTAINERS_PER_RUN MAX_SYFT_JSON_BYTES=$MAX_SYFT_JSON_BYTES MAX_STDOUT_BYTES=$MAX_STDOUT_BYTES CACHE_DIR=$CACHE_DIR CACHE_TTL_SECONDS=$CACHE_TTL_SECONDS RESCAN=$RESCAN STATUS_ONLY_ON_UNCHANGED=$STATUS_ONLY_ON_UNCHANGED"

# Fetch the list of running containers and make sure it has the expected shape
# before anything else looks at it.
log info "Querying Docker for running containers"
containers_json_file="$TMPDIR_RUN/containers.json"
docker_get "/containers/json" > "$containers_json_file"
if ! jq -e 'type=="array"' "$containers_json_file" >/dev/null; then
  die "Docker response was not a JSON array"
fi

count="$(jq 'length' "$containers_json_file")"
log info "Running containers: $count"

# Hard stop rather than a partial scan when the host runs more containers
# than this run is allowed to handle.
if (( count > MAX_CONTAINERS_PER_RUN )); then
  die "Refusing to process $count containers; --max-containers=$MAX_CONTAINERS_PER_RUN"
fi

# Nothing is running: report that, record an empty image set as the new state
# (which turns every previously known image into a "removed" event) and stop.
if [[ "$count" -eq 0 ]]; then
  no_containers_file="$TMPDIR_RUN/no-containers.json"
  jq -n -c \
    --arg script_version "$SCRIPT_VERSION" \
    '{
    type:"container_inventory",
    runtime:"docker",
    message:"no running containers",
    scanner_version:$script_version
  }' > "$no_containers_file"
  safe_emit_file "$no_containers_file"

  current_state="$TMPDIR_RUN/state.new.json"
  jq -n \
    --arg syft_version "$CURRENT_SYFT_VERSION" \
    --arg script_version "$SCRIPT_VERSION" \
    --arg options_fingerprint "$OPTIONS_FP" \
    '{
    version:1,
    last_run:(now|todateiso8601),
    syft_version:$syft_version,
    script_version:$script_version,
    options_fingerprint:$options_fingerprint,
    images:{}
  }' > "$current_state"

  previous_state_exists && emit_removed_events "$STATE_FILE" "$current_state"

  write_state_file "$current_state"
  log info "No running containers; exiting"
  exit 0
fi

# Collapse the container list into one JSON line per distinct image ID, each
# carrying the containers (id, name, image reference) that run that image.
grouped_file="$TMPDIR_RUN/grouped.jsonl"
jq -c '
  map({
    image_id: .ImageID,
    containers: [{
      id: .Id,
      name: ((.Names // [])[0] // "" | sub("^/";"")),
      image: .Image
    }]
  })
  | group_by(.image_id)
  | map({
      image_id: .[0].image_id,
      containers: (map(.containers[0]))
    })
  | .[]
' "$containers_json_file" > "$grouped_file"
# Note on the name field above: a container entry with a null or empty Names
# list yields an empty name instead of a jq error, so one odd entry cannot
# abort the whole run.

unique_images="$(wc -l < "$grouped_file" | tr -d ' ')"
log info "Unique images in use: $unique_images"

if (( unique_images > MAX_IMAGES_PER_RUN )); then
  die "Refusing to process $unique_images images; --max-images=$MAX_IMAGES_PER_RUN"
fi

# Start the state document for this run with an empty image map; the per-image
# loop adds one entry for every image it accepts.
current_state="$TMPDIR_RUN/state.new.json"
jq -n \
  --arg syft_version "$CURRENT_SYFT_VERSION" \
  --arg script_version "$SCRIPT_VERSION" \
  --arg options_fingerprint "$OPTIONS_FP" \
  '{
  version:1,
  last_run:(now|todateiso8601),
  syft_version:$syft_version,
  script_version:$script_version,
  options_fingerprint:$options_fingerprint,
  images:{}
}' > "$current_state"

#######################################
# Emit a container_image_meta record for one image (sbom mode with
# --emit-meta yes). Used for both cached and freshly scanned images.
# Globals:   TMPDIR_RUN, SCRIPT_VERSION (read)
# Arguments: $1 - image ID
#            $2 - file holding the JSON array of containers using the image
# Outputs:   one compact JSON record, via safe_emit_file
#######################################
_emit_image_meta() {
  local image_id="$1"
  local containers_json="$2"
  local meta_file
  meta_file="$TMPDIR_RUN/meta-$(sanitize_image_id_for_filename "$image_id").json"
  jq -n -c \
    --arg image_id "$image_id" \
    --arg collected_at "$(ts)" \
    --arg script_version "$SCRIPT_VERSION" \
    --slurpfile containers "$containers_json" \
    '{
      type:"container_image_meta",
      runtime:"docker",
      image_id:$image_id,
      collected_at:$collected_at,
      scanner_version:$script_version,
      containers:$containers[0]
    }' > "$meta_file"
  safe_emit_file "$meta_file"
}

# Per-image loop. For every distinct image: validate the ID, record it in the
# new state, then either replay/acknowledge a valid cache entry or run Syft,
# cache the result and emit it. A failure for one image is counted and the
# loop moves on to the next image.
#
# The image list is read on file descriptor 3 so that no command started in
# the loop body can consume the remaining lines through an inherited stdin.
while IFS= read -r item <&3; do
  imageid="$(jq -r '.image_id' <<< "$item")"

  if [[ -z "$imageid" || "$imageid" == "null" ]]; then
    log warn "Skipping empty/null image_id"
    skipped_images=$((skipped_images + 1))
    continue
  fi

  if (( ${#imageid} > 512 )); then
    log warn "Skipping suspiciously long image_id length=${#imageid}"
    skipped_images=$((skipped_images + 1))
    continue
  fi

  if ! is_allowed_image_id "$imageid"; then
    log warn "Skipping image_id with unexpected prefix: $imageid"
    skipped_images=$((skipped_images + 1))
    continue
  fi

  # mktemp guarantees a unique file per image; a name built from $RANDOM
  # could repeat and mix up the container lists of two images.
  containers_file="$(mktemp "$TMPDIR_RUN/containers-XXXXXX")"
  jq -c '.containers' <<< "$item" > "$containers_file"

  # Update current state for this image
  state_tmp="$TMPDIR_RUN/state.tmp.json"
  jq \
    --arg image_id "$imageid" \
    --slurpfile containers "$containers_file" \
    '.images[$image_id] = {containers: $containers[0]}' \
    "$current_state" > "$state_tmp"
  mv -f "$state_tmp" "$current_state"

  cache_file="$(cache_file_for_image "$imageid")"
  syft_target="docker:$(normalize_image_id "$imageid")"
  processed_images=$((processed_images + 1))

  cache_ok="no"
  if [[ "$RESCAN" == "no" ]] && cache_valid_for_image "$cache_file" "$imageid" "$OPTIONS_FP" "$CURRENT_SYFT_VERSION"; then
    cache_ok="yes"
  fi

  # ---------- cache hit ----------
  if [[ "$cache_ok" == "yes" ]]; then
    cache_hits=$((cache_hits + 1))
    log info "Cache hit: image_id=$imageid"

    if [[ "$STATUS_ONLY_ON_UNCHANGED" == "yes" ]]; then
      emit_status_event "$imageid" "$containers_file" "unchanged"
    elif [[ "$OUTPUT_MODE" == "inventory" ]]; then
      # Replay the cached inventory with a fresh timestamp and the current
      # container list.
      payload_file="$TMPDIR_RUN/payload-$(sanitize_image_id_for_filename "$imageid").json"
      jq -c \
        --arg collected_at "$(ts)" \
        --arg script_version "$SCRIPT_VERSION" \
        --slurpfile containers "$containers_file" \
        '
          .payload
          | .collected_at = $collected_at
          | .scanner_version = $script_version
          | .containers = $containers[0]
        ' "$cache_file" > "$payload_file"
      safe_emit_file "$payload_file"
    else
      if [[ "$EMIT_META" == "yes" ]]; then
        _emit_image_meta "$imageid" "$containers_file"
      fi

      # JSON SBOMs are cached as a JSON value, other formats as raw text.
      payload_type="$(jq -r '.payload_type // "json"' "$cache_file")"
      if [[ "$payload_type" == "json" ]]; then
        payload_file="$TMPDIR_RUN/payload-$(sanitize_image_id_for_filename "$imageid").json"
        jq -c '.payload' "$cache_file" > "$payload_file"
        safe_emit_file "$payload_file"
      else
        payload_file="$TMPDIR_RUN/payload-$(sanitize_image_id_for_filename "$imageid").txt"
        jq -r '.payload_text' "$cache_file" > "$payload_file"
        safe_emit_file "$payload_file"
      fi
    fi

    continue
  fi

  # ---------- cache miss ----------
  cache_misses=$((cache_misses + 1))
  log info "Cache miss: image_id=$imageid"

  if [[ "$OUTPUT_MODE" == "sbom" ]]; then
    sbom_file="$TMPDIR_RUN/sbom-${processed_images}.out"
    if ! run_syft_to_file "$sbom_file" "$syft_target" -o "$SBOM_FORMAT"; then
      log error "Syft failed for image_id=$imageid"
      failed_images=$((failed_images + 1))
      continue
    fi

    outbytes="$(wc -c < "$sbom_file" | tr -d ' ')"
    if (( outbytes > MAX_SYFT_JSON_BYTES )); then
      log error "SBOM output too large for image_id=$imageid bytes=$outbytes limit=$MAX_SYFT_JSON_BYTES"
      failed_images=$((failed_images + 1))
      continue
    fi

    # JSON-based formats must parse as a JSON object; everything else is
    # handled as opaque text.
    payload_type="text"
    case "$SBOM_FORMAT" in
      syft-json|syft-json*)
        jq -e 'type=="object"' "$sbom_file" >/dev/null || {
          log error "Invalid syft-json output for image_id=$imageid"
          failed_images=$((failed_images + 1))
          continue
        }
        payload_type="json"
        ;;
      cyclonedx-json*|spdx-json*)
        jq -e 'type=="object"' "$sbom_file" >/dev/null || {
          log error "Invalid JSON SBOM output for image_id=$imageid"
          failed_images=$((failed_images + 1))
          continue
        }
        payload_type="json"
        ;;
    esac

    if [[ "$payload_type" == "json" ]]; then
      jq -n -c \
        --arg image_id "$imageid" \
        --arg mode "$OUTPUT_MODE" \
        --arg sbom_format "$SBOM_FORMAT" \
        --arg syft_version "$CURRENT_SYFT_VERSION" \
        --arg script_version "$SCRIPT_VERSION" \
        --arg options_fingerprint "$OPTIONS_FP" \
        --arg collected_at "$(ts)" \
        --slurpfile payload "$sbom_file" \
        '{
          version:1,
          image_id:$image_id,
          mode:$mode,
          sbom_format:$sbom_format,
          syft_version:$syft_version,
          script_version:$script_version,
          options_fingerprint:$options_fingerprint,
          collected_at:$collected_at,
          payload_type:"json",
          payload:$payload[0]
        }' > "$cache_file"
    else
      jq -n -c \
        --arg image_id "$imageid" \
        --arg mode "$OUTPUT_MODE" \
        --arg sbom_format "$SBOM_FORMAT" \
        --arg syft_version "$CURRENT_SYFT_VERSION" \
        --arg script_version "$SCRIPT_VERSION" \
        --arg options_fingerprint "$OPTIONS_FP" \
        --arg collected_at "$(ts)" \
        --rawfile payload_text "$sbom_file" \
        '{
          version:1,
          image_id:$image_id,
          mode:$mode,
          sbom_format:$sbom_format,
          syft_version:$syft_version,
          script_version:$script_version,
          options_fingerprint:$options_fingerprint,
          collected_at:$collected_at,
          payload_type:"text",
          payload_text:$payload_text
        }' > "$cache_file"
    fi

    if [[ "$EMIT_META" == "yes" ]]; then
      _emit_image_meta "$imageid" "$containers_file"
    fi

    safe_emit_file "$sbom_file"
    continue
  fi

  # inventory mode
  syft_json_file="$TMPDIR_RUN/syft-${processed_images}.json"
  packages_file="$TMPDIR_RUN/packages-${processed_images}.json"
  inventory_payload_file="$TMPDIR_RUN/inventory-${processed_images}.json"

  if ! run_syft_to_file "$syft_json_file" "$syft_target" -o syft-json; then
    log error "Syft failed for image_id=$imageid"
    failed_images=$((failed_images + 1))
    continue
  fi

  outbytes="$(wc -c < "$syft_json_file" | tr -d ' ')"
  if (( outbytes > MAX_SYFT_JSON_BYTES )); then
    log error "Syft JSON too large for image_id=$imageid bytes=$outbytes limit=$MAX_SYFT_JSON_BYTES"
    failed_images=$((failed_images + 1))
    continue
  fi

  # .artifacts must be an array or absent/null. Its type is bound to a
  # variable first: in jq the pipe binds loosest, so testing ".artifacts"
  # again after the pipe would look inside the artifacts value itself and let
  # an object-typed .artifacts through.
  jq -e 'type=="object" and ((.artifacts | type) as $t | $t == "array" or $t == "null")' "$syft_json_file" >/dev/null || {
    log error "Invalid syft-json structure for image_id=$imageid"
    failed_images=$((failed_images + 1))
    continue
  }

  syft_json_to_inventory_packages_file "$syft_json_file" "$packages_file"

  jq -n -c \
    --arg runtime "docker" \
    --arg image_id "$imageid" \
    --arg collected_at "$(ts)" \
    --arg script_version "$SCRIPT_VERSION" \
    --slurpfile containers "$containers_file" \
    --slurpfile packages "$packages_file" \
    '{
      type:"container_inventory",
      runtime:$runtime,
      image_id:$image_id,
      collected_at:$collected_at,
      scanner_version:$script_version,
      containers:$containers[0],
      packages:$packages[0]
    }' > "$inventory_payload_file"

  jq -n -c \
    --arg image_id "$imageid" \
    --arg mode "$OUTPUT_MODE" \
    --arg syft_version "$CURRENT_SYFT_VERSION" \
    --arg script_version "$SCRIPT_VERSION" \
    --arg options_fingerprint "$OPTIONS_FP" \
    --arg collected_at "$(ts)" \
    --slurpfile payload "$inventory_payload_file" \
    '{
      version:1,
      image_id:$image_id,
      mode:$mode,
      syft_version:$syft_version,
      script_version:$script_version,
      options_fingerprint:$options_fingerprint,
      collected_at:$collected_at,
      payload_type:"json",
      payload:$payload[0]
    }' > "$cache_file"

  safe_emit_file "$inventory_payload_file"

done 3< "$grouped_file"

# Report images that were known to the previous run but are gone now, then
# make this run's image set the new baseline.
previous_state_exists && emit_removed_events "$STATE_FILE" "$current_state"

write_state_file "$current_state"

log info "Collector completed successfully processed_images=$processed_images cache_hits=$cache_hits cache_misses=$cache_misses failed_images=$failed_images skipped_images=$skipped_images emitted_bytes=$emitted_bytes"

# With --fail-on-partial yes, any failed or skipped image makes the run exit
# with an error even though all other results were emitted.
! should_fail_run || die "Run completed partially: failed_images=$failed_images skipped_images=$skipped_images"
