
Commit 07b639a

Merge pull request #423 from roboflow/azure-cloud-script
Add a script to help generate Azure SAS URLs for use with batch processing, a follow-up to the adjacent S3 and GCS scripts that do the same.
2 parents 3503694 + 9839990 commit 07b639a

1 file changed (+179, −0 lines)


scripts/generateAzureSasUrls.sh

Lines changed: 179 additions & 0 deletions
@@ -0,0 +1,179 @@
#!/bin/bash
# Script to generate Azure Blob Storage SAS URLs for image files in JSONL format
# Requires the Azure CLI (az) installed and logged in: https://learn.microsoft.com/en-us/cli/azure/install-azure-cli?view=azure-cli-latest
# Usage: ./generateAzureSasUrls.sh <container-url> [output-file] [expiration-hours] [parallel-jobs]
# Example: ./generateAzureSasUrls.sh https://myaccount.blob.core.windows.net/mycontainer output.jsonl 6 8
# Or with curl:
# curl -fsSL https://raw.githubusercontent.com/roboflow/roboflow-python/main/scripts/generateAzureSasUrls.sh | bash -s -- https://myaccount.blob.core.windows.net/mycontainer output.jsonl

set -e

# Check if container URL is provided
if [ -z "$1" ]; then
  echo "Error: Azure container URL is required"
  echo "Usage: $0 <container-url> [output-file] [expiration-hours] [parallel-jobs]"
  echo "Example: $0 https://myaccount.blob.core.windows.net/mycontainer output.jsonl 6 8"
  exit 1
fi

CONTAINER_URL="$1"
OUTPUT_FILE="${2:-signed_urls.jsonl}"
EXPIRATION_HOURS="${3:-6}"  # Default: 6 hours
PARALLEL_JOBS="${4:-20}"    # Default: 20 parallel jobs

# Remove trailing slash from container URL if present
CONTAINER_URL="${CONTAINER_URL%/}"

# Extract storage account and container from URL
STORAGE_ACCOUNT=$(echo "$CONTAINER_URL" | sed -E 's|https://([^.]+)\.blob\.core\.windows\.net/.*|\1|')
CONTAINER=$(echo "$CONTAINER_URL" | sed -E 's|https://[^/]+/([^/]+).*|\1|')

# Optional: extract a path prefix if one was provided in the URL
# (no leading slash, so it can be passed directly to --prefix below)
PATH_PREFIX=$(echo "$CONTAINER_URL" | sed -E 's|https://[^/]+/[^/]+/?(.*)|\1|')

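# Illustrative usage note (hypothetical container sub-path): including a sub-path in
# the container URL restricts listing and signing to blobs under that prefix, e.g.
#   ./generateAzureSasUrls.sh https://myaccount.blob.core.windows.net/mycontainer/train/images urls.jsonl 12 10
# would only sign image blobs whose names start with "train/images".
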
# Image file extensions to include (regex pattern for grep)
IMAGE_PATTERN='\.(jpg|jpeg|png|gif|bmp|webp|tiff|tif|svg)$'

# Calculate expiry time in UTC (cross-platform compatible)
if date --version >/dev/null 2>&1; then
  # GNU date (Linux)
  EXPIRY=$(date -u -d "+${EXPIRATION_HOURS} hours" '+%Y-%m-%dT%H:%MZ')
else
  # BSD date (macOS)
  EXPIRY=$(date -u -v+${EXPIRATION_HOURS}H '+%Y-%m-%dT%H:%MZ')
fi

# Function to process a single blob
process_blob() {
  local blob_name="$1"
  local storage_account="$2"
  local container="$3"
  local expiry="$4"

  # Generate SAS token for the specific blob (redirect stderr to suppress warnings)
  local sas_token=$(az storage blob generate-sas \
    --account-name "$storage_account" \
    --container-name "$container" \
    --name "$blob_name" \
    --permissions r \
    --expiry "$expiry" \
    --https-only \
    --auth-mode key \
    --output tsv 2>/dev/null)

  # After `local var=$(...)`, $? reflects `local` (always 0), so test the token itself
  if [ -n "$sas_token" ]; then
    # Construct the full URL with SAS token
    local signed_url="https://${storage_account}.blob.core.windows.net/${container}/${blob_name}?${sas_token}"

    # Create name with full path using double underscores instead of slashes
    local name_with_path=$(echo "$blob_name" | sed 's|/|__|g')

    # Output JSONL
    echo "{\"name\": \"$name_with_path\", \"url\": \"$signed_url\"}"
  fi
}

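# For reference, each line process_blob emits looks like the following (illustrative
# blob name; the SAS query string varies with the account key and expiry):
#   {"name": "train__images__0001.jpg", "url": "https://myaccount.blob.core.windows.net/mycontainer/train/images/0001.jpg?se=...&sig=..."}
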
# Alternative function using connection string or SAS token at account level
process_blob_with_connection() {
  local blob_name="$1"
  local storage_account="$2"
  local container="$3"
  local expiry="$4"

  # Generate SAS token using connection string if AZURE_STORAGE_CONNECTION_STRING is set
  if [ -n "$AZURE_STORAGE_CONNECTION_STRING" ]; then
    local sas_token=$(az storage blob generate-sas \
      --connection-string "$AZURE_STORAGE_CONNECTION_STRING" \
      --container-name "$container" \
      --name "$blob_name" \
      --permissions r \
      --expiry "$expiry" \
      --https-only \
      --output tsv 2>/dev/null)
  else
    # Use account key if available
    local sas_token=$(az storage blob generate-sas \
      --account-name "$storage_account" \
      --container-name "$container" \
      --name "$blob_name" \
      --permissions r \
      --expiry "$expiry" \
      --https-only \
      --output tsv 2>/dev/null)
  fi

  # As above, test the token itself rather than $?
  if [ -n "$sas_token" ]; then
    local signed_url="https://${storage_account}.blob.core.windows.net/${container}/${blob_name}?${sas_token}"
    local name_with_path=$(echo "$blob_name" | sed 's|/|__|g')
    echo "{\"name\": \"$name_with_path\", \"url\": \"$signed_url\"}"
  fi
}

# Check if user is logged in to Azure CLI
if ! az account show &>/dev/null; then
  echo "Error: Not logged in to Azure CLI. Please run 'az login' first."
  exit 1
fi

# Export functions and variables so they are visible to subshells
export -f process_blob process_blob_with_connection
export STORAGE_ACCOUNT CONTAINER EXPIRY
export AZURE_STORAGE_CONNECTION_STRING

echo "Listing blobs from container: $CONTAINER in account: $STORAGE_ACCOUNT..."
if [ -n "$PATH_PREFIX" ]; then
  echo "Using path prefix: $PATH_PREFIX"
fi

# Get list of all blobs, filter for images, and process in parallel
# Create a temporary file for the blob list to avoid stdin issues
BLOB_LIST=$(mktemp)
trap "rm -f $BLOB_LIST" EXIT

if [ -n "$PATH_PREFIX" ]; then
  # List blobs with prefix
  az storage blob list \
    --account-name "$STORAGE_ACCOUNT" \
    --container-name "$CONTAINER" \
    --prefix "$PATH_PREFIX" \
    --auth-mode key \
    --query "[].name" \
    --output tsv 2>/dev/null | grep -iE "$IMAGE_PATTERN" > "$BLOB_LIST"
else
  # List all blobs in container
  az storage blob list \
    --account-name "$STORAGE_ACCOUNT" \
    --container-name "$CONTAINER" \
    --auth-mode key \
    --query "[].name" \
    --output tsv 2>/dev/null | grep -iE "$IMAGE_PATTERN" > "$BLOB_LIST"
fi

# Process blobs in parallel using background jobs
: > "$OUTPUT_FILE" # Clear output file
COUNT=0

while IFS= read -r blob_name; do
  # Process blob in background
  process_blob "$blob_name" "$STORAGE_ACCOUNT" "$CONTAINER" "$EXPIRY" >> "$OUTPUT_FILE" &

  # Limit concurrent jobs (avoid ((COUNT++)): it returns non-zero when COUNT is 0 and would trip set -e)
  COUNT=$((COUNT + 1))
  if [ $((COUNT % PARALLEL_JOBS)) -eq 0 ]; then
    wait # Wait for current batch to complete
  fi
done < "$BLOB_LIST"

# Wait for any remaining jobs
wait

# Display the results
cat "$OUTPUT_FILE"

echo ""
echo "Done! SAS URLs written to $OUTPUT_FILE"
echo "Total images processed: $(wc -l < "$OUTPUT_FILE" 2>/dev/null || echo 0)"
echo "SAS tokens valid until: $EXPIRY"
