Skip Existing Tile Downloads

2025-01-21 17:15 UTC gpt-4o Open in ChatGPT ↗

only proceed with data download if the tile does not exist

# Download every tile listed in a CSV file from an S3 bucket by shelling out
# to the AWS CLI with unsigned requests. Tiles are always downloaded, even
# when a local copy is already present.
import subprocess

import boto3
import os
import pandas as pd

# Configuration
bucket_name = "dataforgood-fb-data"
prefix = "forests/v1/alsgedi_global_v6_float/chm/"
output_folder = "../../geo/canopy/"

# Create output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Load the CSV file with tile IDs
tile_file = "../../geo/canopy/filtered_tile_list.csv"  # Update with the path to your file
tile_data = pd.read_csv(tile_file)

# Extract the tile column (blank entries dropped, duplicates removed)
tile_ids = tile_data["tile"].dropna().unique()

# Download each tile using AWS CLI
for tile_id in tile_ids:
    tile_filename = f"{tile_id}.tif"
    s3_path = f"s3://{bucket_name}/{prefix}{tile_filename}"
    local_path = os.path.join(output_folder, tile_filename)
    print(f"Downloading {tile_filename} to {local_path}...")
    try:
        # Run the AWS CLI command; check=True raises CalledProcessError
        # when the command exits with a non-zero status.
        subprocess.run(["aws", "s3", "cp", s3_path, local_path, "--no-sign-request"], check=True)
        print(f"Downloaded: {tile_filename}")
    except subprocess.CalledProcessError as e:
        # Report the failure and carry on with the next tile.
        print(f"Failed to download {tile_filename}: {e}")

print("Tile download completed.")

To ensure that the script only downloads a tile if it doesn’t already exist in the output folder, you can add a condition that checks for the file before running the `aws s3 cp` command. Here’s the updated script:

import subprocess
import os
import pandas as pd

# Download the tiles listed in a CSV file from an S3 bucket via the AWS CLI,
# skipping any tile whose .tif file is already present in the output folder.

# Configuration
bucket_name = "dataforgood-fb-data"
prefix = "forests/v1/alsgedi_global_v6_float/chm/"
output_folder = "../../geo/canopy/"

# Create output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Load the CSV file with tile IDs.
# Read the "tile" column as text: with pandas' default type inference,
# numeric-looking IDs would be parsed as numbers, dropping leading zeros
# (or gaining a ".0" suffix when the column contains blanks). The resulting
# file name would then match neither the S3 object nor an existing local file.
tile_file = "../../geo/canopy/filtered_tile_list.csv"  # Update with the path to your file
tile_data = pd.read_csv(tile_file, dtype={"tile": str})

# Extract the tile column (blank entries dropped, duplicates removed)
tile_ids = tile_data["tile"].dropna().unique()

# Download each tile using AWS CLI
for tile_id in tile_ids:
    tile_filename = f"{tile_id}.tif"
    local_path = os.path.join(output_folder, tile_filename)

    # Check if the tile already exists locally; if so, do not download again
    if os.path.exists(local_path):
        print(f"Skipping {tile_filename}: already exists.")
        continue

    s3_path = f"s3://{bucket_name}/{prefix}{tile_filename}"
    print(f"Downloading {tile_filename} to {local_path}...")

    try:
        # Run the AWS CLI command; check=True raises CalledProcessError
        # when the command exits with a non-zero status.
        subprocess.run(["aws", "s3", "cp", s3_path, local_path, "--no-sign-request"], check=True)
        print(f"Downloaded: {tile_filename}")
    except subprocess.CalledProcessError as e:
        # Report the failure and carry on with the next tile.
        print(f"Failed to download {tile_filename}: {e}")

print("Tile download completed.")

Changes Made:

- **Check for existing files:** The line `if os.path.exists(local_path):` checks whether the tile is already present in the output folder. If it is, the script skips the download for that tile.
- **Skip message:** The script prints a message whenever it skips a tile, so the logs make it clear why that tile wasn’t downloaded.
- **Error handling:** Unchanged. A failed `aws s3 cp` call (non-zero exit status) is still caught and reported, and the loop continues with the next tile.

This ensures that the script avoids redundant downloads, saving time and bandwidth.