#!/usr/bin/env bash
# Download all URLs (one per line) from a text file into a destination directory.
# This script was written by Copilot.
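#
# Expected input format (illustrative example only; the URLs below are
# placeholders, not real dataset locations):
#
#   # blank lines and lines starting with '#' are ignored
#   https://example.com/dataset_part1.csv.gz
#   https://example.com/dataset_part2.csv
#
# Files ending in .gz are decompressed in place after a successful download.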
set -uo pipefail
usage() {
    echo "Usage: $0 <urls.txt> <destination_dir>"
    echo "Example: $0 urls.txt ~/Downloads/files"
    exit 1
}
# ---- Args & prerequisites ----
[[ $# -ne 2 ]] && usage
URLS_FILE="$1"
DEST_DIR="$2"
if [[ ! -f "$URLS_FILE" ]]; then
    echo "Error: URL list file not found: $URLS_FILE" >&2
    exit 2
fi
mkdir -p "$DEST_DIR" || {
    echo "Error: Cannot create/access destination directory: $DEST_DIR" >&2
    exit 3
}
# Prefer curl if available; otherwise fall back to wget
DOWNLOADER=""
if command -v curl >/dev/null 2>&1; then
    DOWNLOADER="curl"
elif command -v wget >/dev/null 2>&1; then
    DOWNLOADER="wget"
else
    echo "Error: Neither 'curl' nor 'wget' found. Please install one." >&2
    exit 4
fi
echo "Using downloader: $DOWNLOADER"
echo "Reading URLs from: $URLS_FILE"
echo "Saving to: $DEST_DIR"
echo
# ---- Download loop ----
# Reads lines including the last one even if it lacks a trailing newline.
while IFS= read -r url || [[ -n "$url" ]]; do
    # Strip leading/trailing whitespace first, then skip empty lines and comments
    url="$(printf '%s' "$url" | awk '{$1=$1;print}')"
    [[ -z "$url" ]] && continue
    [[ "$url" =~ ^# ]] && continue
    # Basic scheme check
    if ! [[ "$url" =~ ^https?:// ]]; then
        echo "Skipping (invalid URL scheme): $url" >&2
        continue
    fi
echo "→ Downloading: $url"
if [[ "$DOWNLOADER" == "curl" ]]; then
# -f fail on HTTP errors
# -L follow redirects
# -C - resume if possible
# --retry 3 retry transient failures
# -OJ save using server-provided filename (Content-Disposition) if present
# (cd to dest so curl -O/-OJ writes there)
(
cd "$DEST_DIR" && \
curl -fL -C - --retry 3 --remote-header-name -OJ "$url"
) || {
echo " ⚠️ Failed: $url" >&2
}
    else
        # wget:
        # --content-disposition    respect the server-provided filename
        # --tries=3, --timeout=10  retry/transient handling
        # --directory-prefix       write to dest
        # --no-clobber             skip the file if it already exists
        wget -q --content-disposition --tries=3 --timeout=10 \
            --directory-prefix="$DEST_DIR" --no-clobber "$url" || {
            echo " ⚠️ Failed: $url" >&2
        }
    fi
    # Extract .gz files (filename is derived from the URL; if the download failed
    # or the server renamed the file via Content-Disposition, the check skips it)
    if [[ "$url" =~ \.gz$ ]]; then
        filename="${url##*/}"
        if [[ -f "$DEST_DIR/$filename" ]]; then
            echo "Extracting: $filename"
            gunzip "$DEST_DIR/$filename" || echo " ⚠️ Extraction failed: $filename" >&2
        fi
    fi
done < "$URLS_FILE"
echo
echo "✅ Done. Files saved in: $DEST_DIR"