#!/bin/bash
# Script to generate Azure Blob Storage SAS URLs for image files in JSONL format.
# Requires the Azure CLI (az) installed and logged in:
#   https://learn.microsoft.com/en-us/cli/azure/install-azure-cli?view=azure-cli-latest
# Usage: ./generateAzureSasUrls.sh <container-url> [output-file] [expiration-hours] [parallel-jobs]
# Example: ./generateAzureSasUrls.sh https://myaccount.blob.core.windows.net/mycontainer output.jsonl 6 8
# Or with curl:
# curl -fsSL https://raw.githubusercontent.com/roboflow/roboflow-python/main/scripts/generateAzureSasUrls.sh | bash -s -- https://myaccount.blob.core.windows.net/mycontainer output.jsonl

set -e

# Check if container URL is provided (diagnostics go to stderr)
if [ -z "$1" ]; then
  echo "Error: Azure container URL is required" >&2
  echo "Usage: $0 <container-url> [output-file] [expiration-hours] [parallel-jobs]" >&2
  echo "Example: $0 https://myaccount.blob.core.windows.net/mycontainer output.jsonl 6 8" >&2
  exit 1
fi

CONTAINER_URL="$1"
OUTPUT_FILE="${2:-signed_urls.jsonl}"
EXPIRATION_HOURS="${3:-6}"  # Default: 6 hours
PARALLEL_JOBS="${4:-20}"    # Default: 20 parallel jobs

# Both numeric options must be positive integers: EXPIRATION_HOURS is handed
# to `date`, and PARALLEL_JOBS is used as a modulus (0 would divide by zero).
case "$EXPIRATION_HOURS" in
  '' | *[!0-9]*)
    echo "Error: expiration-hours must be a positive integer (got: $EXPIRATION_HOURS)" >&2
    exit 1
    ;;
esac
case "$PARALLEL_JOBS" in
  '' | *[!0-9]*)
    echo "Error: parallel-jobs must be a positive integer (got: $PARALLEL_JOBS)" >&2
    exit 1
    ;;
esac
# Force base 10 so values such as "08" are not treated as invalid octal later.
EXPIRATION_HOURS=$((10#$EXPIRATION_HOURS))
PARALLEL_JOBS=$((10#$PARALLEL_JOBS))
if [ "$EXPIRATION_HOURS" -lt 1 ] || [ "$PARALLEL_JOBS" -lt 1 ]; then
  echo "Error: expiration-hours and parallel-jobs must be at least 1" >&2
  exit 1
fi

# Remove trailing slash from container URL if present
CONTAINER_URL="${CONTAINER_URL%/}"

# Extract storage account, container and optional path prefix from the URL.
# A URL that does not have the expected shape is rejected instead of being
# passed through as a bogus account name.
CONTAINER_URL_RE='^https://([^./]+)\.blob\.core\.windows\.net/([^/]+)(/(.*))?$'
if [[ "$CONTAINER_URL" =~ $CONTAINER_URL_RE ]]; then
  STORAGE_ACCOUNT="${BASH_REMATCH[1]}"
  CONTAINER="${BASH_REMATCH[2]}"
  # The prefix is kept WITHOUT a leading slash so it can be matched against
  # the start of blob names such as "dir/img.png".
  PATH_PREFIX="${BASH_REMATCH[4]}"
else
  echo "Error: container URL must look like https://<account>.blob.core.windows.net/<container>[/<prefix>]" >&2
  echo "Got: $CONTAINER_URL" >&2
  exit 1
fi

# Image file extensions to include (regex pattern for grep)
IMAGE_PATTERN='\.(jpg|jpeg|png|gif|bmp|webp|tiff|tif|svg)$'

# Calculate expiry time in UTC (cross-platform compatible)
if date --version >/dev/null 2>&1; then
  # GNU date (Linux)
  EXPIRY=$(date -u -d "+${EXPIRATION_HOURS} hours" '+%Y-%m-%dT%H:%MZ')
else
  # BSD date (macOS)
  EXPIRY=$(date -u "-v+${EXPIRATION_HOURS}H" '+%Y-%m-%dT%H:%MZ')
fi
| 48 | + |
#######################################
# Generate a read-only SAS URL for a single blob and print it as one JSONL line.
# Arguments:
#   $1 - blob name (path of the blob inside the container)
#   $2 - storage account name
#   $3 - container name
#   $4 - expiry timestamp, passed unchanged to `az ... --expiry`
# Outputs:
#   On success, writes {"name": "...", "url": "..."} to stdout, where name is
#   the blob path with every "/" replaced by "__".
#   On failure, writes a warning to stderr and nothing to stdout.
# Returns:
#   0 on success, 1 if no SAS token could be generated.
#######################################
process_blob() {
  local blob_name="$1"
  local storage_account="$2"
  local container="$3"
  local expiry="$4"
  local sas_token signed_url name_with_path

  # Declaration and assignment are separate on purpose: `local v=$(cmd)`
  # reports the status of `local` (always 0) and would hide an az failure.
  # az's stderr is discarded to suppress its warnings.
  if ! sas_token=$(az storage blob generate-sas \
    --account-name "$storage_account" \
    --container-name "$container" \
    --name "$blob_name" \
    --permissions r \
    --expiry "$expiry" \
    --https-only \
    --auth-mode key \
    --output tsv 2>/dev/null) || [ -z "$sas_token" ]; then
    echo "Warning: failed to generate SAS token for blob: $blob_name" >&2
    return 1
  fi

  # Construct the full URL with SAS token.
  # NOTE(review): the blob name is not percent-encoded here; names containing
  # spaces or other reserved characters may need encoding - confirm with consumers.
  signed_url="https://${storage_account}.blob.core.windows.net/${container}/${blob_name}?${sas_token}"

  # Create name with full path using double underscores instead of slashes
  name_with_path=${blob_name//\//__}

  # Escape backslashes first, then double quotes, so the emitted line stays
  # valid JSON even for unusual blob names.
  name_with_path=${name_with_path//\\/\\\\}
  name_with_path=${name_with_path//\"/\\\"}
  signed_url=${signed_url//\\/\\\\}
  signed_url=${signed_url//\"/\\\"}

  # Output JSONL
  printf '{"name": "%s", "url": "%s"}\n' "$name_with_path" "$signed_url"
}
| 78 | + |
#######################################
# Alternative to process_blob: generate a read-only SAS URL for one blob using
# the connection string in AZURE_STORAGE_CONNECTION_STRING when it is set, and
# falling back to --account-name otherwise.
# Globals:
#   AZURE_STORAGE_CONNECTION_STRING (read, optional)
# Arguments:
#   $1 - blob name (path of the blob inside the container)
#   $2 - storage account name (also used to build the URL host)
#   $3 - container name
#   $4 - expiry timestamp, passed unchanged to `az ... --expiry`
# Outputs:
#   On success, writes {"name": "...", "url": "..."} to stdout, where name is
#   the blob path with every "/" replaced by "__".
#   On failure, writes a warning to stderr and nothing to stdout.
# Returns:
#   0 on success, 1 if no SAS token could be generated.
#######################################
process_blob_with_connection() {
  local blob_name="$1"
  local storage_account="$2"
  local container="$3"
  local expiry="$4"
  local sas_token signed_url name_with_path
  local -a sas_args

  # Options shared by both authentication variants.
  sas_args=(
    --container-name "$container"
    --name "$blob_name"
    --permissions r
    --expiry "$expiry"
    --https-only
    --output tsv
  )

  if [ -n "${AZURE_STORAGE_CONNECTION_STRING:-}" ]; then
    # Generate SAS token using the connection string
    sas_args=(--connection-string "$AZURE_STORAGE_CONNECTION_STRING" "${sas_args[@]}")
  else
    # Use the account name (az resolves the credentials itself)
    sas_args=(--account-name "$storage_account" "${sas_args[@]}")
  fi

  # Assignment is kept separate from `local` so the az exit status is not
  # masked. az's stderr is discarded to suppress its warnings.
  if ! sas_token=$(az storage blob generate-sas "${sas_args[@]}" 2>/dev/null) \
    || [ -z "$sas_token" ]; then
    echo "Warning: failed to generate SAS token for blob: $blob_name" >&2
    return 1
  fi

  signed_url="https://${storage_account}.blob.core.windows.net/${container}/${blob_name}?${sas_token}"
  name_with_path=${blob_name//\//__}

  # Escape backslashes first, then double quotes, to keep the output valid JSON.
  name_with_path=${name_with_path//\\/\\\\}
  name_with_path=${name_with_path//\"/\\\"}
  signed_url=${signed_url//\\/\\\\}
  signed_url=${signed_url//\"/\\\"}

  printf '{"name": "%s", "url": "%s"}\n' "$name_with_path" "$signed_url"
}
| 114 | + |
# Check that the Azure CLI is installed and the user is logged in
if ! command -v az >/dev/null 2>&1; then
  echo "Error: Azure CLI (az) not found. Please install it first." >&2
  exit 1
fi
if ! az account show &>/dev/null; then
  echo "Error: Not logged in to Azure CLI. Please run 'az login' first." >&2
  exit 1
fi

# Export functions and variables so child bash processes can use them.
# The loop below calls process_blob directly; the exports are kept for
# compatibility with callers that spawn sub-shells.
export -f process_blob process_blob_with_connection
export STORAGE_ACCOUNT CONTAINER EXPIRY
export AZURE_STORAGE_CONNECTION_STRING

# The listing prefix is matched against the start of blob names such as
# "dir/img.png", so any leading slash on the prefix is dropped.
LIST_PREFIX="${PATH_PREFIX#/}"

echo "Listing blobs from container: $CONTAINER in account: $STORAGE_ACCOUNT..."
if [ -n "$LIST_PREFIX" ]; then
  echo "Using path prefix: $LIST_PREFIX"
fi

# Temporary files: raw listing, az error output, and the filtered image list.
# The trap body is single-quoted so the paths are expanded (and quoted) when
# the trap runs, not when it is defined.
BLOB_LIST=$(mktemp)
BLOB_LIST_RAW=$(mktemp)
BLOB_LIST_ERR=$(mktemp)
trap 'rm -f -- "$BLOB_LIST" "$BLOB_LIST_RAW" "$BLOB_LIST_ERR"' EXIT

# --num-results "*" asks for every blob; the CLI default stops at 5000.
LIST_ARGS=(
  --account-name "$STORAGE_ACCOUNT"
  --container-name "$CONTAINER"
  --auth-mode key
  --num-results "*"
  --query "[].name"
  --output tsv
)
if [ -n "$LIST_PREFIX" ]; then
  LIST_ARGS+=(--prefix "$LIST_PREFIX")
fi

# az warnings stay hidden on success, but a failed listing is reported
# together with the CLI's own error output instead of exiting silently.
if ! az storage blob list "${LIST_ARGS[@]}" >"$BLOB_LIST_RAW" 2>"$BLOB_LIST_ERR"; then
  echo "Error: failed to list blobs in container: $CONTAINER" >&2
  cat "$BLOB_LIST_ERR" >&2
  exit 1
fi

# Keep only image files. grep exits 1 when nothing matches; that is not an
# error here, so only a status other than 1 is allowed to abort the script.
grep -iE -- "$IMAGE_PATTERN" "$BLOB_LIST_RAW" >"$BLOB_LIST" || [ $? -eq 1 ]

if [ ! -s "$BLOB_LIST" ]; then
  echo "Warning: no image blobs found" >&2
fi

# Process blobs in parallel using background jobs
: >"$OUTPUT_FILE" # Clear output file
COUNT=0

while IFS= read -r blob_name || [ -n "$blob_name" ]; do
  # Process blob in background; stdin is detached so a job can never consume
  # lines from the blob list the loop is reading.
  process_blob "$blob_name" "$STORAGE_ACCOUNT" "$CONTAINER" "$EXPIRY" \
    >>"$OUTPUT_FILE" </dev/null &

  # Limit concurrent jobs. Plain assignment is used instead of ((COUNT++)),
  # which returns status 1 when COUNT is 0 and would trip `set -e`.
  COUNT=$((COUNT + 1))
  if [ $((COUNT % PARALLEL_JOBS)) -eq 0 ]; then
    wait # Wait for current batch to complete
  fi
done <"$BLOB_LIST"

# Wait for any remaining jobs
wait

# Display the results
cat "$OUTPUT_FILE"

# Arithmetic expansion strips the padding some `wc` implementations add.
WRITTEN=$(($(wc -l <"$OUTPUT_FILE")))
if [ "$WRITTEN" -lt "$COUNT" ]; then
  echo "Warning: $((COUNT - WRITTEN)) of $COUNT blobs could not be signed" >&2
fi

echo ""
echo "Done! SAS URLs written to $OUTPUT_FILE"
echo "Total images processed: $WRITTEN"
echo "SAS tokens valid until: $EXPIRY"
0 commit comments