#!/usr/bin/env bash
# Download all URLs (one per line) from a text file into a destination directory.
# This script was written by Copilot.
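#
# Expected input format (illustrative example only; the URLs below are
# placeholders, not real dataset locations):
#
#   # blank lines and lines starting with '#' are ignored
#   https://example.com/dataset_part1.csv.gz
#   https://example.com/dataset_part2.csv
#
# Files ending in .gz are decompressed in place after a successful download.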
set -uo pipefail
usage() {
    echo "Usage: $0 <urls.txt> <destination_dir>"
    echo "Example: $0 urls.txt ~/Downloads/files"
    exit 1
}
# ---- Args & prerequisites ----
[[ $# -ne 2 ]] && usage
URLS_FILE="$1"
DEST_DIR="$2"
if [[ ! -f "$URLS_FILE" ]]; then
    echo "Error: URL list file not found: $URLS_FILE" >&2
    exit 2
fi
mkdir -p "$DEST_DIR" || {
    echo "Error: Cannot create/access destination directory: $DEST_DIR" >&2
    exit 3
}
# Prefer curl if available; otherwise fall back to wget
DOWNLOADER=""
if command -v curl >/dev/null 2>&1; then
    DOWNLOADER="curl"
elif command -v wget >/dev/null 2>&1; then
    DOWNLOADER="wget"
else
    echo "Error: Neither 'curl' nor 'wget' found. Please install one." >&2
    exit 4
fi
echo "Using downloader: $DOWNLOADER"
echo "Reading URLs from: $URLS_FILE"
echo "Saving to: $DEST_DIR"
echo
# ---- Download loop ----
# Reads lines including the last one even if it lacks a trailing newline.
while IFS= read -r url || [[ -n "$url" ]]; do
    # Strip leading/trailing whitespace first, then skip empty lines and comments
    url="$(printf '%s' "$url" | awk '{$1=$1;print}')"
    [[ -z "$url" ]] && continue
    [[ "$url" =~ ^# ]] && continue
    # Basic scheme check
    if ! [[ "$url" =~ ^https?:// ]]; then
        echo "Skipping (invalid URL scheme): $url" >&2
        continue
    fi
echo "→ Downloading: $url"
if [[ "$DOWNLOADER" == "curl" ]]; then
# -f fail on HTTP errors
# -L follow redirects
# -C - resume if possible
# --retry 3 retry transient failures
# -OJ save using server-provided filename (Content-Disposition) if present
# (cd to dest so curl -O/-OJ writes there)
(
cd "$DEST_DIR" && \
curl -fL -C - --retry 3 --remote-header-name -OJ "$url"
) || {
echo " ⚠️ Failed: $url" >&2
}
    else
        # wget:
        # --content-disposition    respect the server-provided filename
        # --tries=3, --timeout=10  retry/transient handling
        # --directory-prefix       write to dest
        # --no-clobber             skip the file if it already exists
        wget -q --content-disposition --tries=3 --timeout=10 \
            --directory-prefix="$DEST_DIR" --no-clobber "$url" || {
            echo " ⚠️ Failed: $url" >&2
        }
    fi
    # Extract .gz files (filename is derived from the URL; if the download failed
    # or the server renamed the file via Content-Disposition, the check skips it)
    if [[ "$url" =~ \.gz$ ]]; then
        filename="${url##*/}"
        if [[ -f "$DEST_DIR/$filename" ]]; then
            echo "Extracting: $filename"
            gunzip "$DEST_DIR/$filename" || echo " ⚠️ Extraction failed: $filename" >&2
        fi
    fi
done < "$URLS_FILE"
echo
echo "✅ Done. Files saved in: $DEST_DIR"