diff --git a/scripts/README_storage_tier.md b/scripts/README_change_storage_tier.md similarity index 63% rename from scripts/README_storage_tier.md rename to scripts/README_change_storage_tier.md index b7df50a..f97bad5 100644 --- a/scripts/README_storage_tier.md +++ b/scripts/README_change_storage_tier.md @@ -50,21 +50,11 @@ uv run python scripts/change_storage_tier.py \ For easier command execution, define the STAC item ID as a variable: ```bash -ITEM_ID="S2B_MSIL2A_20250730T113319_N0511_R080_T29UQP_20250730T135754" +ITEM_ID="S2A_MSIL2A_20251209T123131_N0511_R009_T26SPG_20251209T163109" ``` ## Usage -### Basic Usage - -Run the script using the STAC item ID variable defined in the setup: - -```bash -uv run python scripts/change_storage_tier.py \ - --stac-item-url https://api.explorer.eopf.copernicus.eu/stac/collections/sentinel-2-l2a/items/$ITEM_ID \ - --storage-class GLACIER -``` - ### Dry Run Test the script without making actual changes. Dry-run mode will: @@ -75,17 +65,27 @@ Test the script without making actual changes. Dry-run mode will: ```bash uv run python scripts/change_storage_tier.py \ - --stac-item-url https://api.explorer.eopf.copernicus.eu/stac/collections/sentinel-2-l2a/items/$ITEM_ID \ - --storage-class GLACIER \ + --stac-item-url https://api.explorer.eopf.copernicus.eu/stac/collections/sentinel-2-l2a-staging/items/$ITEM_ID \ + --storage-class STANDARD_IA \ --dry-run ``` +### Basic Usage + +Run the script using the STAC item ID variable defined in the setup: + +```bash +uv run python scripts/change_storage_tier.py \ + --stac-item-url https://api.explorer.eopf.copernicus.eu/stac/collections/sentinel-2-l2a-staging/items/$ITEM_ID \ + --storage-class STANDARD_IA +``` + ### With Custom S3 Endpoint ```bash uv run python scripts/change_storage_tier.py \ - --stac-item-url https://api.explorer.eopf.copernicus.eu/stac/collections/sentinel-2-l2a/items/$ITEM_ID \ - --storage-class GLACIER \ + --stac-item-url https://api.explorer.eopf.copernicus.eu/stac/collections/sentinel-2-l2a-staging/items/$ITEM_ID \ + --storage-class STANDARD_IA \ --s3-endpoint https://s3.de.io.cloud.ovh.net ``` @@ -96,45 +96,80 @@ Only change storage class for specific parts of the Zarr store: ```bash # Only process reflectance data uv run python scripts/change_storage_tier.py \ - --stac-item-url https://api.explorer.eopf.copernicus.eu/stac/collections/sentinel-2-l2a/items/$ITEM_ID \ - --storage-class GLACIER \ + --stac-item-url https://api.explorer.eopf.copernicus.eu/stac/collections/sentinel-2-l2a-staging/items/$ITEM_ID \ + --storage-class STANDARD_IA \ --include-pattern "measurements/reflectance/*" # Process multiple subdirectories uv run python scripts/change_storage_tier.py \ - --stac-item-url https://api.explorer.eopf.copernicus.eu/stac/collections/sentinel-2-l2a/items/$ITEM_ID \ - --storage-class GLACIER \ + --stac-item-url https://api.explorer.eopf.copernicus.eu/stac/collections/sentinel-2-l2a-staging/items/$ITEM_ID \ + --storage-class STANDARD_IA \ --include-pattern "measurements/*" \ --include-pattern "quality/*" # Exclude metadata files uv run python scripts/change_storage_tier.py \ - --stac-item-url https://api.explorer.eopf.copernicus.eu/stac/collections/sentinel-2-l2a/items/$ITEM_ID \ - --storage-class GLACIER \ + --stac-item-url https://api.explorer.eopf.copernicus.eu/stac/collections/sentinel-2-l2a-staging/items/$ITEM_ID \ + --storage-class STANDARD_IA \ --exclude-pattern "*.zattrs" \ --exclude-pattern "*.zmetadata" # Only process 60m resolution data uv run python scripts/change_storage_tier.py \ 
-    --stac-item-url https://api.explorer.eopf.copernicus.eu/stac/collections/sentinel-2-l2a/items/$ITEM_ID \
-    --storage-class GLACIER \
+    --stac-item-url https://api.explorer.eopf.copernicus.eu/stac/collections/sentinel-2-l2a-staging/items/$ITEM_ID \
+    --storage-class STANDARD_IA \
     --include-pattern "*/r60m/*"
 ```
 
 ## Available Storage Classes
 
 - **STANDARD** - Standard storage tier (default, immediate access, higher cost)
-- **GLACIER** - Archive storage tier (lower cost, retrieval required before access)
+- **STANDARD_IA** - Infrequent Access storage tier (lower cost, intended for rarely accessed data)
 - **EXPRESS_ONEZONE** - High-performance storage tier (single availability zone)
 
+### OVH Cloud Storage Classes
+
+**Important**: This script uses OVH Cloud Storage class naming directly to avoid confusion.
+
+**Supported Storage Classes:**
+
+- `STANDARD` - Standard storage (default)
+- `STANDARD_IA` - Standard, Infrequent Access (archive storage, low-cost)
+- `EXPRESS_ONEZONE` - High Performance (low-latency storage)
+
+**Full AWS to OVH Storage Class Mapping:**
+
+| AWS Storage Class | OVH Storage Class | CLI Value (this script) |
+|-------------------|-------------------|------------------------|
+| `EXPRESS_ONEZONE` | High Performance | `EXPRESS_ONEZONE` |
+| `STANDARD` | Standard | `STANDARD` |
+| `INTELLIGENT_TIERING` | Standard | `STANDARD` |
+| `STANDARD_IA` | Standard, Infrequent Access | `STANDARD_IA` |
+| `ONEZONE_IA` | Standard, Infrequent Access | `STANDARD_IA` |
+| `GLACIER_IR` | Standard, Infrequent Access | `STANDARD_IA` |
+| `GLACIER` | Standard, Infrequent Access | `STANDARD_IA` |
+| `DEEP_ARCHIVE` | Cold Archive | N/A (not supported) |
+
+**Note**: Multiple AWS storage classes map to the same OVH tier, which is why this script accepts the OVH names (e.g. `STANDARD_IA`) rather than the AWS names (e.g. `GLACIER`).
+
+**Reference**: [OVH Cloud Storage S3 Location Documentation](https://help.ovhcloud.com/csm/en-public-cloud-storage-s3-location?id=kb_article_view&sysparm_article=KB0047384)
+
 ## How It Works
 
 1. Fetches the STAC item from the provided URL
 2. Extracts S3 URLs from the `alternate.s3.href` fields in each asset
 3. Identifies the root Zarr store location
-4. Lists all objects in the Zarr store recursively
+4. Lists all objects in the Zarr store recursively (the listing includes each object's storage class)
 5. Optionally filters objects based on include/exclude patterns
-6. Changes the storage class for each object using the S3 API
+6. Skips objects that are already at the target storage class (no API call needed)
+7. Changes the storage class of the remaining objects using the S3 API
+
+### Performance Optimizations
+
+The script has been optimized to minimize S3 API calls:
+
+- **Storage class from list**: Retrieves the storage class during the initial listing (no extra `head_object` calls)
+- **Smart filtering**: Only makes `copy_object` API calls for objects that actually need to change storage class
+- **Progress tracking**: Shows how many objects need changes vs. how many are already correct
 
 ## Path Filtering
 
@@ -187,8 +222,8 @@ uv run python scripts/register_v1.py \
 # 3. Change storage tier (optional)
 ITEM_ID="your-item-id"
 uv run python scripts/change_storage_tier.py \
-    --stac-item-url https://api.explorer.eopf.copernicus.eu/stac/collections/sentinel-2-l2a/items/$ITEM_ID \
-    --storage-class GLACIER
+    --stac-item-url https://api.explorer.eopf.copernicus.eu/stac/collections/sentinel-2-l2a-staging/items/$ITEM_ID \
+    --storage-class STANDARD_IA
 ```
 
 ## Error Handling
 
@@ -213,8 +248,8 @@ The script provides detailed logging at different levels:
 Set the `LOG_LEVEL` environment variable to control verbosity:
 
 ```bash
 LOG_LEVEL=DEBUG uv run python scripts/change_storage_tier.py \
-    --stac-item-url https://api.explorer.eopf.copernicus.eu/stac/collections/sentinel-2-l2a/items/$ITEM_ID \
-    --storage-class GLACIER
+    --stac-item-url https://api.explorer.eopf.copernicus.eu/stac/collections/sentinel-2-l2a-staging/items/$ITEM_ID \
+    --storage-class STANDARD_IA
 ```
 
 ## Examples
 
@@ -233,34 +268,52 @@ Use dry-run to see the current storage classes without making changes:
 
 ```bash
 uv run python scripts/change_storage_tier.py \
-    --stac-item-url https://api.explorer.eopf.copernicus.eu/stac/collections/sentinel-2-l2a/items/$ITEM_ID \
-    --storage-class GLACIER \
+    --stac-item-url https://api.explorer.eopf.copernicus.eu/stac/collections/sentinel-2-l2a-staging/items/$ITEM_ID \
+    --storage-class STANDARD \
+    --include-pattern "*/r60m/*" \
     --dry-run
 ```
 
 Output example:
 
 ```
-Summary for S2A_MSIL2A_20250831T103701_N0511_R008_T31TFL_20250831T145420:
-  Total objects: 1500
-  Skipped (filtered): 0
-  Processed: 1500
-  Succeeded: 1500
-  Failed: 0
-
-Current storage class distribution:
-  GLACIER: 300 objects (20.0%)
-  STANDARD: 1200 objects (80.0%)
+Processing: S2A_MSIL2A_20251209T123131_N0511_R009_T26SPG_20251209T163109
+Target storage class: STANDARD
+Include patterns: */r60m/*
+Found 4 S3 URLs
+Zarr root: s3://esa-zarr-sentinel-explorer-fra/.../S2A_MSIL2A_20251209T123131_N0511_R009_T26SPG_20251209T163109.zarr
+Listing objects in s3://esa-zarr-sentinel-explorer-fra/.../S2A_MSIL2A_20251209T123131_N0511_R009_T26SPG_20251209T163109.zarr/
+Found 1058 total objects
+After filtering: 260 objects to process, 798 excluded
+
+Initial storage class distribution (before changes):
+  EXPRESS_ONEZONE: 260 objects (100.0%)
+  (DRY RUN)
+
+Processing 260 objects...
+  0 already have target storage class STANDARD
+  260 need to be changed
+  Progress: 100/260 objects (38%)
+  Progress: 200/260 objects (76%)
+  Progress: 260/260 objects (100%)
+============================================================
+Summary for S2A_MSIL2A_20251209T123131_N0511_R009_T26SPG_20251209T163109:
+  Total objects: 1058
+  Skipped (filtered): 798
+  Already correct storage class: 0
+  Changed: 260
+  Succeeded: 260
+  Failed: 0
 ```
+
+**Note**: The storage class distribution is shown before processing starts; when not in dry-run mode, the expected distribution after the changes is also displayed. To verify that changes were applied, re-run the same command with `--dry-run`: every object should then report the target storage class.
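+
+As an additional check, storage classes can be tallied straight from an S3 listing, independently of this script. The snippet below is a minimal sketch rather than part of this repository: the bucket and prefix are placeholders, the endpoint is the one used in the `--s3-endpoint` example above, and S3 credentials are assumed to be configured in the environment.
+
+```python
+import boto3
+
+# Placeholder bucket/prefix; the endpoint matches the --s3-endpoint example above.
+s3 = boto3.client("s3", endpoint_url="https://s3.de.io.cloud.ovh.net")
+paginator = s3.get_paginator("list_objects_v2")
+
+counts: dict[str, int] = {}
+for page in paginator.paginate(Bucket="my-bucket", Prefix="path/to/item.zarr/"):
+    for obj in page.get("Contents", []):
+        # S3 may omit StorageClass for STANDARD objects, so default to STANDARD.
+        sc = obj.get("StorageClass", "STANDARD")
+        counts[sc] = counts.get(sc, 0) + 1
+
+for storage_class, count in sorted(counts.items()):
+    print(f"{storage_class}: {count} objects")
+```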
+ ### Preview changes for specific data subset Test what would happen when archiving only 60m resolution data: ```bash uv run python scripts/change_storage_tier.py \ - --stac-item-url https://api.explorer.eopf.copernicus.eu/stac/collections/sentinel-2-l2a/items/$ITEM_ID \ - --storage-class GLACIER \ + --stac-item-url https://api.explorer.eopf.copernicus.eu/stac/collections/sentinel-2-l2a-staging/items/$ITEM_ID \ + --storage-class STANDARD_IA \ --include-pattern "*/r60m/*" \ --dry-run ``` @@ -269,8 +322,8 @@ uv run python scripts/change_storage_tier.py \ ```bash uv run python scripts/change_storage_tier.py \ - --stac-item-url https://api.explorer.eopf.copernicus.eu/stac/collections/sentinel-2-l2a/items/$ITEM_ID \ - --storage-class GLACIER \ + --stac-item-url https://api.explorer.eopf.copernicus.eu/stac/collections/sentinel-2-l2a-staging/items/$ITEM_ID \ + --storage-class STANDARD_IA \ --include-pattern "measurements/reflectance/*" \ --dry-run ``` @@ -279,27 +332,27 @@ uv run python scripts/change_storage_tier.py \ ```bash uv run python scripts/change_storage_tier.py \ - --stac-item-url https://api.explorer.eopf.copernicus.eu/stac/collections/sentinel-2-l2a/items/$ITEM_ID \ - --storage-class GLACIER \ + --stac-item-url https://api.explorer.eopf.copernicus.eu/stac/collections/sentinel-2-l2a-staging/items/$ITEM_ID \ + --storage-class STANDARD_IA \ --exclude-pattern "*.zattrs" \ --exclude-pattern "*.zmetadata" \ --dry-run ``` -### Archive only reflectance data to GLACIER +### Archive only reflectance data to STANDARD_IA ```bash # First, preview the changes uv run python scripts/change_storage_tier.py \ - --stac-item-url https://api.explorer.eopf.copernicus.eu/stac/collections/sentinel-2-l2a/items/$ITEM_ID \ - --storage-class GLACIER \ + --stac-item-url https://api.explorer.eopf.copernicus.eu/stac/collections/sentinel-2-l2a-staging/items/$ITEM_ID \ + --storage-class STANDARD_IA \ --include-pattern "measurements/reflectance/*" \ --dry-run # Then apply the changes uv run python scripts/change_storage_tier.py \ - --stac-item-url https://api.explorer.eopf.copernicus.eu/stac/collections/sentinel-2-l2a/items/$ITEM_ID \ - --storage-class GLACIER \ + --stac-item-url https://api.explorer.eopf.copernicus.eu/stac/collections/sentinel-2-l2a-staging/items/$ITEM_ID \ + --storage-class STANDARD_IA \ --include-pattern "measurements/reflectance/*" ``` @@ -308,47 +361,47 @@ uv run python scripts/change_storage_tier.py \ ```bash # Preview first uv run python scripts/change_storage_tier.py \ - --stac-item-url https://api.explorer.eopf.copernicus.eu/stac/collections/sentinel-2-l2a/items/$ITEM_ID \ - --storage-class GLACIER \ + --stac-item-url https://api.explorer.eopf.copernicus.eu/stac/collections/sentinel-2-l2a-staging/items/$ITEM_ID \ + --storage-class STANDARD_IA \ --include-pattern "measurements/*" \ --exclude-pattern "*/r10m/*" \ --dry-run # Apply changes uv run python scripts/change_storage_tier.py \ - --stac-item-url https://api.explorer.eopf.copernicus.eu/stac/collections/sentinel-2-l2a/items/$ITEM_ID \ - --storage-class GLACIER \ + --stac-item-url https://api.explorer.eopf.copernicus.eu/stac/collections/sentinel-2-l2a-staging/items/$ITEM_ID \ + --storage-class STANDARD_IA \ --include-pattern "measurements/*" \ --exclude-pattern "*/r10m/*" ``` -### Archive old data to GLACIER +### Archive old data to STANDARD_IA ```bash # Preview the changes uv run python scripts/change_storage_tier.py \ - --stac-item-url https://api.explorer.eopf.copernicus.eu/stac/collections/sentinel-2-l2a/items/$ITEM_ID \ - 
--storage-class GLACIER \ + --stac-item-url https://api.explorer.eopf.copernicus.eu/stac/collections/sentinel-2-l2a-staging/items/$ITEM_ID \ + --storage-class STANDARD_IA \ --dry-run # Apply the changes uv run python scripts/change_storage_tier.py \ - --stac-item-url https://api.explorer.eopf.copernicus.eu/stac/collections/sentinel-2-l2a/items/$ITEM_ID \ - --storage-class GLACIER + --stac-item-url https://api.explorer.eopf.copernicus.eu/stac/collections/sentinel-2-l2a-staging/items/$ITEM_ID \ + --storage-class STANDARD_IA ``` -### Restore data from GLACIER to STANDARD +### Restore data from STANDARD_IA to STANDARD ```bash # Preview the changes uv run python scripts/change_storage_tier.py \ - --stac-item-url https://api.explorer.eopf.copernicus.eu/stac/collections/sentinel-2-l2a/items/$ITEM_ID \ + --stac-item-url https://api.explorer.eopf.copernicus.eu/stac/collections/sentinel-2-l2a-staging/items/$ITEM_ID \ --storage-class STANDARD \ --dry-run # Apply the changes uv run python scripts/change_storage_tier.py \ - --stac-item-url https://api.explorer.eopf.copernicus.eu/stac/collections/sentinel-2-l2a/items/$ITEM_ID \ + --stac-item-url https://api.explorer.eopf.copernicus.eu/stac/collections/sentinel-2-l2a-staging/items/$ITEM_ID \ --storage-class STANDARD ``` @@ -357,12 +410,12 @@ uv run python scripts/change_storage_tier.py \ ```bash # Preview the changes uv run python scripts/change_storage_tier.py \ - --stac-item-url https://api.explorer.eopf.copernicus.eu/stac/collections/sentinel-2-l2a/items/$ITEM_ID \ + --stac-item-url https://api.explorer.eopf.copernicus.eu/stac/collections/sentinel-2-l2a-staging/items/$ITEM_ID \ --storage-class EXPRESS_ONEZONE \ --dry-run # Apply the changes uv run python scripts/change_storage_tier.py \ - --stac-item-url https://api.explorer.eopf.copernicus.eu/stac/collections/sentinel-2-l2a/items/$ITEM_ID \ + --stac-item-url https://api.explorer.eopf.copernicus.eu/stac/collections/sentinel-2-l2a-staging/items/$ITEM_ID \ --storage-class EXPRESS_ONEZONE ``` diff --git a/scripts/change_storage_tier.py b/scripts/change_storage_tier.py index 22b24f5..e23f87e 100755 --- a/scripts/change_storage_tier.py +++ b/scripts/change_storage_tier.py @@ -29,7 +29,8 @@ logging.getLogger(lib).setLevel(logging.WARNING) # Valid S3 storage classes -VALID_STORAGE_CLASSES = frozenset(["STANDARD", "GLACIER", "EXPRESS_ONEZONE"]) +# Using OVH Cloud Storage naming: STANDARD_IA (Infrequent Access) instead of AWS GLACIER +VALID_STORAGE_CLASSES = frozenset(["STANDARD", "STANDARD_IA", "EXPRESS_ONEZONE"]) def validate_storage_class(storage_class: str) -> bool: @@ -71,53 +72,66 @@ def get_zarr_root(s3_urls: set[str]) -> str | None: return None -def list_objects(s3_client, bucket: str, prefix: str) -> list[str]: # type: ignore - """List all objects under S3 prefix.""" +def list_objects(s3_client, bucket: str, prefix: str) -> list[tuple[str, str]]: # type: ignore + """List all objects under S3 prefix with their storage class. 
+ + Returns: + List of tuples (key, storage_class) + """ objects = [] paginator = s3_client.get_paginator("list_objects_v2") - for page in paginator.paginate(Bucket=bucket, Prefix=prefix): - for obj in page.get("Contents", []): - objects.append(obj["Key"]) + for page_count, page in enumerate(paginator.paginate(Bucket=bucket, Prefix=prefix), start=1): + page_objects = page.get("Contents", []) + for obj in page_objects: + # Storage class is included in list_objects response + # Note: S3 returns "STANDARD" implicitly when StorageClass is not present + # (objects in STANDARD tier don't always have the field set) + storage_class = obj.get("StorageClass", "STANDARD") + objects.append((obj["Key"], storage_class)) + + # Log progress every 10 pages (typically 10,000 objects) + if page_count % 10 == 0: + logger.info(f" Listed {len(objects)} objects so far ({page_count} pages)...") return objects def filter_paths( - paths: list[str], + objects: list[tuple[str, str]], include_patterns: list[str] | None = None, exclude_patterns: list[str] | None = None, zarr_prefix: str = "", -) -> tuple[list[str], list[str]]: - """Filter paths based on include/exclude patterns. +) -> tuple[list[tuple[str, str]], list[str]]: + """Filter objects based on include/exclude patterns. Args: - paths: List of object keys to filter - include_patterns: List of fnmatch patterns to include (relative to Zarr root) - exclude_patterns: List of fnmatch patterns to exclude (relative to Zarr root) - zarr_prefix: The Zarr root prefix to strip when matching patterns + objects: List of (key, storage_class) tuples + include_patterns: Patterns to include (relative to Zarr root) + exclude_patterns: Patterns to exclude (relative to Zarr root) + zarr_prefix: Prefix to remove from paths for pattern matching Returns: - Tuple of (filtered_paths, excluded_paths) + Tuple of (filtered_objects, excluded_keys) """ if not include_patterns and not exclude_patterns: - return paths, [] + return objects, [] filtered = [] excluded = [] - for path in paths: + for key, storage_class in objects: # Get relative path from Zarr root for pattern matching - if zarr_prefix and path.startswith(zarr_prefix): - relative_path = path[len(zarr_prefix) :] + if zarr_prefix and key.startswith(zarr_prefix): + relative_path = key[len(zarr_prefix) :] else: - relative_path = path + relative_path = key # Apply include patterns if include_patterns: included = any(fnmatch.fnmatch(relative_path, pattern) for pattern in include_patterns) if not included: - excluded.append(path) + excluded.append(key) continue # Apply exclude patterns @@ -126,10 +140,10 @@ def filter_paths( fnmatch.fnmatch(relative_path, pattern) for pattern in exclude_patterns ) if excluded_match: - excluded.append(path) + excluded.append(key) continue - filtered.append(path) + filtered.append((key, storage_class)) return filtered, excluded @@ -138,44 +152,53 @@ def change_object_storage_class( # type: ignore s3_client, bucket: str, key: str, - storage_class: str, + current_storage_class: str, + target_storage_class: str, dry_run: bool, -) -> tuple[bool, str | None]: +) -> tuple[bool, str]: """Change storage class of single S3 object. 
+ Args: + s3_client: Boto3 S3 client + bucket: S3 bucket name + key: S3 object key + current_storage_class: Current storage class (from list_objects) + target_storage_class: Desired storage class + dry_run: If True, don't make actual changes + Returns: - Tuple of (success: bool, current_storage_class: str | None) + Tuple of (success: bool, current_storage_class: str) """ try: - head = s3_client.head_object(Bucket=bucket, Key=key) - current = head.get("StorageClass", "STANDARD") - if dry_run: - if current == storage_class: - logger.debug(f"[DRY RUN] Already {storage_class}: s3://{bucket}/{key}") + if current_storage_class == target_storage_class: + logger.debug(f"[DRY RUN] Already {target_storage_class}: s3://{bucket}/{key}") else: logger.debug( - f"[DRY RUN] Would change {current} -> {storage_class}: s3://{bucket}/{key}" + f"[DRY RUN] Would change {current_storage_class} -> {target_storage_class}: s3://{bucket}/{key}" ) - return True, current + return True, current_storage_class - if current == storage_class: - logger.debug(f"Already {storage_class}: s3://{bucket}/{key}") - return True, current + if current_storage_class == target_storage_class: + logger.debug(f"Already {target_storage_class}: s3://{bucket}/{key}") + return True, current_storage_class + # Only make API call when actually changing storage class s3_client.copy_object( Bucket=bucket, Key=key, CopySource={"Bucket": bucket, "Key": key}, - StorageClass=storage_class, + StorageClass=target_storage_class, MetadataDirective="COPY", ) - logger.debug(f"Changed {current} -> {storage_class}: s3://{bucket}/{key}") - return True, current + logger.debug( + f"Changed {current_storage_class} -> {target_storage_class}: s3://{bucket}/{key}" + ) + return True, current_storage_class except ClientError as e: logger.error(f"Failed to change s3://{bucket}/{key}: {e}") - return False, None + return False, current_storage_class def process_stac_item( @@ -254,39 +277,91 @@ def process_stac_item( stats = {"processed": 0, "succeeded": 0, "failed": 0, "skipped": len(excluded)} storage_class_counts: dict[str, int] = {} - for obj_key in objects: + # Separate objects that need changes from those that don't + objects_to_change = [(key, current) for key, current in objects if current != storage_class] + objects_already_correct = [ + (key, current) for key, current in objects if current == storage_class + ] + + total_objects = len(objects) + + # Count storage class distribution + for _, current_class in objects: + storage_class_counts[current_class] = storage_class_counts.get(current_class, 0) + 1 + + # Show initial distribution before processing + if storage_class_counts: + logger.info("") + logger.info("Initial storage class distribution (before changes):") + total = sum(storage_class_counts.values()) + for sc in sorted(storage_class_counts.keys()): + count = storage_class_counts[sc] + percentage = (count / total * 100) if total > 0 else 0 + logger.info(f" {sc}: {count} objects ({percentage:.1f}%)") + + # Show expected distribution after changes + if not dry_run and len(objects_to_change) > 0: + logger.info("") + logger.info("Expected storage class distribution (after changes):") + expected_counts = storage_class_counts.copy() + # Remove changed objects from their old classes + for _, old_class in objects_to_change: + expected_counts[old_class] = expected_counts.get(old_class, 0) - 1 + if expected_counts[old_class] == 0: + del expected_counts[old_class] + # Add changed objects to target class + expected_counts[storage_class] = 
expected_counts.get(storage_class, 0) + len( + objects_to_change + ) + + expected_total = sum(expected_counts.values()) + for sc in sorted(expected_counts.keys()): + count = expected_counts[sc] + percentage = (count / expected_total * 100) if expected_total > 0 else 0 + logger.info(f" {sc}: {count} objects ({percentage:.1f}%)") + + if dry_run: + logger.info(" (DRY RUN)") + logger.info("") + + logger.info(f"Processing {total_objects} objects...") + logger.info( + f" {len(objects_already_correct)} already have target storage class {storage_class}" + ) + logger.info(f" {len(objects_to_change)} need to be changed") + + # Count objects that already have correct storage class (no API calls needed) + stats["processed"] += len(objects_already_correct) + stats["succeeded"] += len(objects_already_correct) + + # Process objects that need to change + for processed_count, (obj_key, current_class) in enumerate(objects_to_change, start=1): stats["processed"] += 1 - success, current_class = change_object_storage_class( - s3_client, bucket, obj_key, storage_class, dry_run + + success, _ = change_object_storage_class( + s3_client, bucket, obj_key, current_class, storage_class, dry_run ) if success: stats["succeeded"] += 1 - if current_class: - storage_class_counts[current_class] = storage_class_counts.get(current_class, 0) + 1 else: stats["failed"] += 1 + # Log progress every 100 objects or at the end + if processed_count % 100 == 0 or processed_count == len(objects_to_change): + logger.info( + f" Progress: {stats['processed']}/{total_objects} objects ({stats['processed']*100//max(total_objects, 1)}%)" + ) + # Summary logger.info("=" * 60) logger.info(f"Summary for {item_id}:") logger.info(f" Total objects: {len(all_objects)}") logger.info(f" Skipped (filtered): {stats['skipped']}") - logger.info(f" Processed: {stats['processed']}") + logger.info(f" Already correct storage class: {len(objects_already_correct)}") + logger.info(f" Changed: {len(objects_to_change)}") logger.info(f" Succeeded: {stats['succeeded']}") logger.info(f" Failed: {stats['failed']}") - if dry_run and storage_class_counts: - logger.info("") - logger.info("Current storage class distribution:") - total = sum(storage_class_counts.values()) - for sc in sorted(storage_class_counts.keys()): - count = storage_class_counts[sc] - percentage = (count / total * 100) if total > 0 else 0 - logger.info(f" {sc}: {count} objects ({percentage:.1f}%)") - - if dry_run: - logger.info(" (DRY RUN)") - return stats @@ -297,7 +372,7 @@ def main(argv: list[str] | None = None) -> int: parser.add_argument( "--storage-class", default="STANDARD", - choices=["STANDARD", "GLACIER", "EXPRESS_ONEZONE"], + choices=["STANDARD", "STANDARD_IA", "EXPRESS_ONEZONE"], help="Target storage class", ) parser.add_argument("--dry-run", action="store_true", help="Dry run mode") diff --git a/tests/unit/test_change_storage_tier.py b/tests/unit/test_change_storage_tier.py index 68e692f..9e687d1 100644 --- a/tests/unit/test_change_storage_tier.py +++ b/tests/unit/test_change_storage_tier.py @@ -25,9 +25,9 @@ def test_valid_standard(self) -> None: """Test STANDARD is valid.""" assert validate_storage_class("STANDARD") is True - def test_valid_glacier(self) -> None: - """Test GLACIER is valid.""" - assert validate_storage_class("GLACIER") is True + def test_valid_standard_ia(self) -> None: + """Test STANDARD_IA is valid.""" + assert validate_storage_class("STANDARD_IA") is True def test_valid_express_onezone(self) -> None: """Test EXPRESS_ONEZONE is valid.""" @@ -41,7 +41,7 @@ def 
test_invalid_storage_class(self) -> None: def test_valid_storage_classes_list(self) -> None: """Test all valid storage classes.""" - valid_classes = ["STANDARD", "GLACIER", "EXPRESS_ONEZONE"] + valid_classes = ["STANDARD", "STANDARD_IA", "EXPRESS_ONEZONE"] for sc in valid_classes: assert validate_storage_class(sc) is True @@ -234,32 +234,36 @@ def sample_zarr_paths(self) -> list[str]: def test_filter_paths_no_patterns(self, sample_zarr_paths: list[str]) -> None: """Test that all paths pass when no patterns specified.""" - filtered, excluded = filter_paths(sample_zarr_paths) - assert filtered == sample_zarr_paths + # Convert paths to (key, storage_class) tuples + objects = [(path, "STANDARD") for path in sample_zarr_paths] + filtered, excluded = filter_paths(objects) + assert filtered == objects assert excluded == [] def test_filter_paths_include_pattern(self, sample_zarr_paths: list[str]) -> None: """Test include pattern to select only 10m resolution bands.""" zarr_prefix = "geozarr/S2A_test.zarr/" + objects = [(path, "STANDARD") for path in sample_zarr_paths] filtered, excluded = filter_paths( - sample_zarr_paths, + objects, include_patterns=["measurements/reflectance/r10m/*"], zarr_prefix=zarr_prefix, ) assert len(filtered) == 3 - assert all("r10m" in p for p in filtered) + assert all("r10m" in key for key, _ in filtered) assert len(excluded) == 6 def test_filter_paths_exclude_pattern(self, sample_zarr_paths: list[str]) -> None: """Test exclude pattern to skip Zarr metadata files.""" zarr_prefix = "geozarr/S2A_test.zarr/" + objects = [(path, "STANDARD") for path in sample_zarr_paths] filtered, excluded = filter_paths( - sample_zarr_paths, + objects, exclude_patterns=["*.zattrs", "*.zgroup", "*.zarray"], zarr_prefix=zarr_prefix, ) assert len(filtered) == 5 - assert all(not p.endswith((".zattrs", ".zgroup", ".zarray")) for p in filtered) + assert all(not key.endswith((".zattrs", ".zgroup", ".zarray")) for key, _ in filtered) def test_filter_paths_include_and_exclude(self, sample_zarr_paths: list[str]) -> None: """Test combined include (10m bands) and exclude (60m resolution).""" @@ -270,14 +274,15 @@ def test_filter_paths_include_and_exclude(self, sample_zarr_paths: list[str]) -> f"{zarr_prefix}measurements/reflectance/r20m/B05/0", f"{zarr_prefix}measurements/reflectance/r60m/B01/0", ] + objects = [(path, "STANDARD") for path in paths] filtered, excluded = filter_paths( - paths, + objects, include_patterns=["measurements/reflectance/*"], exclude_patterns=["*/r60m/*"], zarr_prefix=zarr_prefix, ) assert len(filtered) == 3 - assert all("r60m" not in p for p in filtered) + assert all("r60m" not in key for key, _ in filtered) def test_filter_paths_multiple_include_patterns(self) -> None: """Test multiple include patterns (OR logic) for reflectance and quality.""" @@ -287,8 +292,9 @@ def test_filter_paths_multiple_include_patterns(self) -> None: f"{zarr_prefix}measurements/quality/cloud_mask/0", f"{zarr_prefix}metadata/product_info/data", ] + objects = [(path, "STANDARD") for path in paths] filtered, excluded = filter_paths( - paths, + objects, include_patterns=["measurements/reflectance/*", "measurements/quality/*"], zarr_prefix=zarr_prefix, ) @@ -303,8 +309,9 @@ def test_filter_paths_wildcard_patterns(self) -> None: f"{zarr_prefix}measurements/reflectance/r20m/B05/0", f"{zarr_prefix}measurements/reflectance/r60m/B01/0", ] + objects = [(path, "STANDARD") for path in paths] filtered, excluded = filter_paths( - paths, include_patterns=["measurements/reflectance/r?0m/*"], zarr_prefix=zarr_prefix + 
objects, include_patterns=["measurements/reflectance/r?0m/*"], zarr_prefix=zarr_prefix ) assert len(filtered) == 3 # All match r?0m pattern @@ -322,22 +329,38 @@ def mock_s3_client(self) -> MagicMock: mock_paginator = MagicMock() mock_client.get_paginator.return_value = mock_paginator # Default response simulating realistic S3 listing with multiple pages + # StorageClass is included in list_objects_v2 response mock_paginator.paginate.return_value = [ { "Contents": [ - {"Key": f"{self.PREFIX}measurements/reflectance/r10m/B02/0"}, - {"Key": f"{self.PREFIX}measurements/reflectance/r10m/B02/.zarray"}, + { + "Key": f"{self.PREFIX}measurements/reflectance/r10m/B02/0", + "StorageClass": "STANDARD", + }, + { + "Key": f"{self.PREFIX}measurements/reflectance/r10m/B02/.zarray", + "StorageClass": "STANDARD", + }, ] }, { "Contents": [ - {"Key": f"{self.PREFIX}measurements/reflectance/r10m/B03/0"}, + { + "Key": f"{self.PREFIX}measurements/reflectance/r10m/B03/0", + "StorageClass": "STANDARD_IA", + }, ] }, { "Contents": [ - {"Key": f"{self.PREFIX}measurements/reflectance/r20m/B05/0"}, - {"Key": f"{self.PREFIX}measurements/reflectance/r20m/B05/.zarray"}, + { + "Key": f"{self.PREFIX}measurements/reflectance/r20m/B05/0", + "StorageClass": "STANDARD", + }, + { + "Key": f"{self.PREFIX}measurements/reflectance/r20m/B05/.zarray", + "StorageClass": "STANDARD", + }, ] }, ] @@ -364,11 +387,11 @@ def test_list_objects_aggregates_multiple_pages(self, mock_s3_client: MagicMock) assert len(objects) == 5 assert objects == [ - f"{self.PREFIX}measurements/reflectance/r10m/B02/0", - f"{self.PREFIX}measurements/reflectance/r10m/B02/.zarray", - f"{self.PREFIX}measurements/reflectance/r10m/B03/0", - f"{self.PREFIX}measurements/reflectance/r20m/B05/0", - f"{self.PREFIX}measurements/reflectance/r20m/B05/.zarray", + (f"{self.PREFIX}measurements/reflectance/r10m/B02/0", "STANDARD"), + (f"{self.PREFIX}measurements/reflectance/r10m/B02/.zarray", "STANDARD"), + (f"{self.PREFIX}measurements/reflectance/r10m/B03/0", "STANDARD_IA"), + (f"{self.PREFIX}measurements/reflectance/r20m/B05/0", "STANDARD"), + (f"{self.PREFIX}measurements/reflectance/r20m/B05/.zarray", "STANDARD"), ] def test_list_objects_handles_empty_prefix(self, mock_s3_client: MagicMock) -> None: @@ -398,64 +421,65 @@ class TestChangeObjectStorageClass: OBJECT_KEY = "geozarr/S2A_test.zarr/measurements/reflectance/r10m/B02/0" def test_dry_run_mode(self) -> None: - """Test dry run queries storage class but doesn't modify objects.""" + """Test dry run doesn't modify objects (no API calls needed).""" mock_client = MagicMock() - mock_client.head_object.return_value = {"StorageClass": "STANDARD"} success, current_class = change_object_storage_class( - mock_client, self.BUCKET, self.OBJECT_KEY, "GLACIER", dry_run=True + mock_client, self.BUCKET, self.OBJECT_KEY, "STANDARD", "STANDARD_IA", dry_run=True ) assert success is True assert current_class == "STANDARD" - mock_client.head_object.assert_called_once_with(Bucket=self.BUCKET, Key=self.OBJECT_KEY) + # No head_object call needed - storage class already known from list_objects + mock_client.head_object.assert_not_called() mock_client.copy_object.assert_not_called() def test_already_correct_storage_class(self) -> None: - """Test skipping objects already in GLACIER storage class.""" + """Test skipping objects already in STANDARD_IA storage class.""" mock_client = MagicMock() - mock_client.head_object.return_value = {"StorageClass": "GLACIER"} success, current_class = change_object_storage_class( - mock_client, self.BUCKET, 
self.OBJECT_KEY, "GLACIER", dry_run=False + mock_client, self.BUCKET, self.OBJECT_KEY, "STANDARD_IA", "STANDARD_IA", dry_run=False ) assert success is True - assert current_class == "GLACIER" - mock_client.head_object.assert_called_once_with(Bucket=self.BUCKET, Key=self.OBJECT_KEY) + assert current_class == "STANDARD_IA" + # No API calls needed - already correct storage class + mock_client.head_object.assert_not_called() mock_client.copy_object.assert_not_called() def test_change_storage_class_success(self) -> None: - """Test successful storage class change from STANDARD to GLACIER.""" + """Test successful storage class change from STANDARD to STANDARD_IA.""" mock_client = MagicMock() - mock_client.head_object.return_value = {"StorageClass": "STANDARD"} success, current_class = change_object_storage_class( - mock_client, self.BUCKET, self.OBJECT_KEY, "GLACIER", dry_run=False + mock_client, self.BUCKET, self.OBJECT_KEY, "STANDARD", "STANDARD_IA", dry_run=False ) assert success is True assert current_class == "STANDARD" + # Only copy_object call needed to change storage class + mock_client.head_object.assert_not_called() mock_client.copy_object.assert_called_once_with( Bucket=self.BUCKET, Key=self.OBJECT_KEY, CopySource={"Bucket": self.BUCKET, "Key": self.OBJECT_KEY}, - StorageClass="GLACIER", + StorageClass="STANDARD_IA", MetadataDirective="COPY", ) def test_change_storage_class_error(self) -> None: - """Test handling S3 AccessDenied error.""" + """Test handling S3 AccessDenied error during copy_object.""" from botocore.exceptions import ClientError mock_client = MagicMock() - mock_client.head_object.side_effect = ClientError( + mock_client.copy_object.side_effect = ClientError( {"Error": {"Code": "AccessDenied", "Message": "Access Denied"}}, - "HeadObject", + "CopyObject", ) success, current_class = change_object_storage_class( - mock_client, self.BUCKET, self.OBJECT_KEY, "GLACIER", dry_run=False + mock_client, self.BUCKET, self.OBJECT_KEY, "STANDARD", "STANDARD_IA", dry_run=False ) assert success is False - assert current_class is None + assert current_class == "STANDARD" # Returns the known current class class TestProcessStacItem: @@ -478,7 +502,7 @@ def test_process_stac_item_no_s3_urls( stats = process_stac_item( self.STAC_API_URL, - "GLACIER", + "STANDARD_IA", dry_run=False, s3_endpoint=None, ) @@ -514,14 +538,14 @@ def test_process_stac_item_success( } mock_httpx_client.return_value.__enter__.return_value.get.return_value = mock_response mock_list.return_value = [ - "geozarr/S2A_MSIL2A.zarr/measurements/reflectance/r10m/B02/0", - "geozarr/S2A_MSIL2A.zarr/measurements/reflectance/r10m/B02/.zarray", + ("geozarr/S2A_MSIL2A.zarr/measurements/reflectance/r10m/B02/0", "STANDARD"), + ("geozarr/S2A_MSIL2A.zarr/measurements/reflectance/r10m/B02/.zarray", "STANDARD"), ] mock_change.return_value = (True, "STANDARD") stats = process_stac_item( self.STAC_API_URL, - "GLACIER", + "STANDARD_IA", dry_run=False, s3_endpoint=self.S3_ENDPOINT, ) @@ -557,7 +581,7 @@ def test_main_success(self, mock_process: MagicMock) -> None: "--stac-item-url", self.STAC_API_URL, "--storage-class", - "GLACIER", + "STANDARD_IA", "--dry-run", ] ) @@ -573,7 +597,7 @@ def test_main_with_failures(self, mock_process: MagicMock) -> None: "--stac-item-url", self.STAC_API_URL, "--storage-class", - "GLACIER", + "STANDARD_IA", ] ) assert result == 1 @@ -588,7 +612,7 @@ def test_main_with_patterns(self, mock_process: MagicMock) -> None: "--stac-item-url", self.STAC_API_URL, "--storage-class", - "GLACIER", + "STANDARD_IA", 
"--include-pattern", "measurements/reflectance/r10m/*", "--exclude-pattern", @@ -603,7 +627,7 @@ def test_main_with_patterns(self, mock_process: MagicMock) -> None: call_args[0] ) assert stac_item_url == self.STAC_API_URL - assert storage_class == "GLACIER" + assert storage_class == "STANDARD_IA" assert dry_run is False assert s3_endpoint is None assert include_patterns == ["measurements/reflectance/r10m/*"]