Land cover classification at the Mississippi Delta¶

In this notebook, you will use a k-means unsupervised clustering algorithm to group pixels by similar spectral signatures. k-means is an exploratory method for finding patterns in data. Because it is unsupervised, you don’t need any training data for the model. You also can’t measure how well it “performs” because the clusters will not correspond to any particular land cover class. However, we expect at least some of the clusters to be identifiable as different types of land cover.
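
Before working with the satellite data, it may help to see k-means in isolation. The short sketch below (not part of this notebook's workflow) clusters synthetic two-band "pixels"; the reflectance-like values and the number of clusters are invented purely for illustration.

```python
# Minimal k-means illustration on synthetic "pixels" with two bands.
# The band values and cluster count are made up for demonstration only.
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(42)
# Two artificial spectral groups (e.g. "water-like" and "vegetation-like")
water_like = rng.normal(loc=[0.05, 0.02], scale=0.01, size=(100, 2))
veg_like = rng.normal(loc=[0.04, 0.40], scale=0.03, size=(100, 2))
pixels = np.vstack([water_like, veg_like])

# Fit k-means with 2 clusters; labels are arbitrary integers, not land cover classes
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
labels = kmeans.fit_predict(pixels)
print(labels[:5], labels[-5:])
```

The cluster labels themselves carry no meaning; interpreting them (water, vegetation, etc.) is up to the analyst, which is exactly the situation we will be in with the Delta imagery.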

You will use the Harmonized Landsat and Sentinel-2 (HLS) multispectral dataset. You can access the data with an Earthdata account and the earthaccess library from NSIDC:

STEP 1: SET UP¶

BLOCK 0
Import all libraries

In [2]:
import os
import pickle
import re
import warnings
import json 

import cartopy.crs as ccrs
import earthaccess
import earthpy as et
import geopandas as gpd
import geoviews as gv
import hvplot.pandas
import hvplot.xarray
import numpy as np
import pandas as pd
import rioxarray as rxr
import rioxarray.merge as rxrmerge
from tqdm.notebook import tqdm
import xarray as xr
from shapely.geometry import Polygon
from sklearn.cluster import KMeans

os.environ["GDAL_HTTP_MAX_RETRY"] = "5"
os.environ["GDAL_HTTP_RETRY_DELAY"] = "1"

warnings.simplefilter('ignore')

Below you can find code for a caching decorator which you can use in your code. To use the decorator:

@cached(key, override)
def do_something(*args, **kwargs):
    ...
    return item_to_cache

This decorator will pickle the results of running the do_something() function, and only run the code if the results don’t already exist. To override the caching, for example temporarily after making changes to your code, set override=True. Note that to use the caching decorator, you must write your own function to perform each task!

Notes: If the cached file (filename) exists and override is False, the function loads the results from the pickle file. If the file doesn’t exist or override=True, the function runs normally, and the result is saved in a pickle file. This avoids redundant computations, especially for expensive functions.

In Python, do_something() is simply a generic function name that is commonly used as an example in documentation and tutorials to represent a function that performs some specific task. It is not a built-in Python function, but a placeholder for any function you define. For example, you could define do_something() to perform a mathematical operation, process data, access an API, read a file, etc.

BLOCK 1

In [3]:
def cached(func_key, override=False):
    """
    A decorator to cache function results
    
    Parameters
    ==========
    func_key: str
      File basename used to save pickled results
    override: bool
      When True, re-compute even if the results are already stored
    """
    def compute_and_cache_decorator(compute_function): 
        """
        Wrap the caching function
        
        Parameters
        ==========
        compute_function: function
          The function to run and cache results
        """
        def compute_and_cache(*args, **kwargs): 
            """
            Perform a computation and cache, or load cached result.
            
            Parameters
            ==========
            args
              Positional arguments for the compute function
            kwargs
              Keyword arguments for the compute function
            """
            # Add an identifier from the particular function call 
            if 'cache_key' in kwargs:
                key = '_'.join((func_key, kwargs['cache_key']))
            else:
                key = func_key  
            path = os.path.join(
                et.io.HOME, et.io.DATA_NAME, 'jars', f'{key}.pickle') 
                       
            # Check if the cache exists already or override caching
            if override or not os.path.exists(path):
                # Make jars directory if needed
                os.makedirs(os.path.dirname(path), exist_ok=True)
                
                # Run the compute function as the user did 
                result = compute_function(*args, **kwargs)
                
                # Pickle the object 
                with open(path, 'wb') as file:
                    pickle.dump(result, file)
            else:
                # Unpickle the object 
                with open(path, 'rb') as file:
                    result = pickle.load(file)
                    
            return result 
        
        return compute_and_cache 
    return compute_and_cache_decorator

This code defines a Python cache decorator cached(), which stores the results of a function in a pickle file and reuses them in future executions to avoid unnecessary recomputations.

What does this code do?

- Saves the results of a function in a .pickle file, avoiding repeated computations.
- Uses a key identifier (func_key) to generate the name of the file where the cache will be saved.
- If override=True, reruns the function and updates the cache.
- Stores the files in a specific directory (et.io.HOME/et.io.DATA_NAME/jars/).
- If the cache exists, it loads the results from the file instead of recalculating.
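
As a hypothetical usage sketch (the function name and cache key below are placeholders, not part of the notebook), the decorator could be applied like this once earthpy's data directory exists:

```python
# Hypothetical example of the cached() decorator defined above.
@cached('example_square')
def compute_square(n, cache_key=None):
    print(f'Computing {n} squared...')  # printed only on a cache miss
    return n ** 2

# First call runs the function and saves example_square_n4.pickle
print(compute_square(4, cache_key='n4'))
# Second call loads the pickled result instead of recomputing
print(compute_square(4, cache_key='n4'))

# To force recomputation, decorate with override=True instead:
# @cached('example_square', override=True)
```

Note that override is set at decoration time, and any cache_key keyword is passed through to your function, so your function signature must accept it.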

STEP 2: STUDY SITE¶

For this analysis, you will use a watershed from the Watershed Boundary Dataset (WBD), HU12 watersheds (WBDHU12.shp).

Try It
  1. Download the Watershed Boundary Dataset for region 8 (Mississippi)
  2. Select watershed 080902030506
  3. Generate a site map of the watershed

Try to use the caching decorator

We chose this watershed because it covers parts of New Orleans and is near the Mississippi Delta. Deltas are boundary areas between the land and the ocean, and as a result tend to contain a rich variety of different land cover and land use types.

BLOCK 2

In [4]:
@cached('wbd_08')                                                    
def read_wbd_file(wbd_filename, huc_level, cache_key): 
    # Download and unzip
    wbd_url = (
        "https://prd-tnm.s3.amazonaws.com"
        "/StagedProducts/Hydrography/WBD/HU2/Shape/"
        f"{wbd_filename}.zip")
    wbd_dir = et.data.get_data(url=wbd_url)                          
                  
    # Read desired data
    wbd_path = os.path.join(wbd_dir, 'Shape', f'WBDHU{huc_level}.shp') 
    wbd_gdf = gpd.read_file(wbd_path, engine='pyogrio')               
    return wbd_gdf                                                     

huc_level = 12                                                         
wbd_gdf = read_wbd_file(
    "WBD_08_HU2_Shape", huc_level, cache_key=f'hu{huc_level}')         

delta_gdf = (
    wbd_gdf[wbd_gdf[f'huc{huc_level}']                                 
    .isin(['080902030506'])]
    .dissolve()                                                        
)

(
    delta_gdf.to_crs(ccrs.Mercator())                                 
    .hvplot(                                                           
        alpha=.2, fill_color='white',                                  
        tiles='EsriImagery', crs=ccrs.Mercator())
    .opts(width=600, height=300)                                      
)
Out[4]:

THE MISSISSIPPI DELTA¶

The Mississippi River Delta is where the Mississippi River meets the Gulf of Mexico in southeastern Louisiana. Covering about 3 million acres, it is one of the largest coastal wetland areas in the U.S., containing 37% of the nation's estuarine marshes. As the 7th largest river delta in the world, it drains 41% of the contiguous U.S. into the Gulf at an average rate of 470,000 cubic feet per second.

The Mississippi Delta has multiple definitions, including a political one encompassing counties in several states and a natural one referring to the alluvial valley formed by sediment deposition after the Ice Age. The modern delta, built over the last 5,000 years, consists of multiple deltaic lobes, with the most recent being the "bird’s foot" delta near New Orleans. Ecologically, the Delta is a crucial habitat supporting wetlands, hardwood forests, migratory birds, and diverse aquatic species. Human influence on the Delta has ranged from early Indigenous agricultural practices to massive engineering projects like levees and diversions that have reshaped the landscape. While the Delta remains a major agricultural and industrial hub, environmental concerns such as water pollution and wetland loss have led to conservation and cleanup efforts.

Using data from the Mississippi Delta, we will learn how to apply clustering techniques to delta geospatial data, including:

  1. an introduction to clustering,
  2. how to prepare datasets for analysis, and
  3. an implementation of the k-means algorithm (related techniques such as hierarchical clustering offer additional ways to evaluate these groupings).

Reference:
Mississippi River Delta
Natural Environment: The Delta and Its Resources
Mississippi Delta from space

LAND COVER CLASSIFICATION AT THE MISSISSIPPI DELTA¶

According to USGS Open-File Report 2009-1280, delta land covers include water areas (rivers, lakes, estuaries, and the ocean), as well as developed areas, where urban infrastructure, roads, and human facilities are located. There are also mechanically disturbed lands, corresponding to areas affected by deforestation or construction activities, as well as areas dedicated to mining, where subsoil materials are extracted.

Barren areas are composed of arid lands with less than 10% vegetation cover, while forests include areas with more than 10% tree density. There are also extensions of grasslands and scrublands, characterized by the presence of scattered grasses and shrubs. Agriculture is one of the predominant land covers in the region, with land used for crops, pastures, orchards and vineyards.

Wetlands represent a significant part of the landscape, with highly water-saturated soils and specific vegetation. In addition, there are non-mechanically disturbed lands, which have been affected by natural phenomena such as fires or floods. Finally, although to a lesser extent, there are areas covered by ice and snow, such as glaciers or areas with permanent snow accumulations.

Throughout the study period (1973-2000), the most notable change was the conversion of approximately 4,368 km² of wetlands into water bodies.

Reference: Land-Cover Change in the Lower Mississippi Valley, 1973-2000

STEP 3: MULTISPECTRAL DATA¶

Search for data¶

Try It
  1. Log in to the earthaccess service using your Earthdata credentials:

       earthaccess.login(persist=True)

  2. Modify the following sample code to search for granules of the HLSL30 product overlapping the watershed boundary from May to October 2023 (there should be 76 granules):

       results = earthaccess.search_data(
           short_name="...",
           cloud_hosted=True,
           bounding_box=tuple(gdf.total_bounds),
           temporal=("...", "..."),
       )

BLOCK 3

In [5]:
import earthaccess

# log in earthaccess
earthaccess.login(persist=True)

# Ensure that delta_gdf is in geographic coordinates (WGS 84)
if delta_gdf.crs != "EPSG:4326":
    delta_gdf = delta_gdf.to_crs("EPSG:4326")

# define search parameters
short_name = "HLSL30"  
bounding_box = tuple(delta_gdf.total_bounds)  
temporal_range = ("2023-05-01", "2023-10-31")  

# Execute granules search 
results = earthaccess.search_data(
    short_name=short_name,
    cloud_hosted=True,
    bounding_box=bounding_box,
    temporal=temporal_range,
)

# Show the quantity of granules found
print(f"Number of granules found: {len(results)}")

print("Bounding Box:", delta_gdf.total_bounds)
Number of granules found: 88
Bounding Box: [-89.97046834  29.68190731 -89.78679539  29.82339776]

Compile information about each granule¶

I recommend building a GeoDataFrame, as this will allow you to plot the granules you are downloading and make sure they line up with your shapefile. You could also use a DataFrame, dictionary, or a custom object to store this information.

Try It
  1. For each search result:
    1. Get the following information (HINT: look at the [‘umm’] values for each search result):
      • granule id (UR)
      • datetime
      • geometry (HINT: check out the shapely.geometry.Polygon class to convert points to a Polygon)
    2. Open the granule files. I recommend opening one granule at a time, e.g. with earthaccess.open([result]).
    3. For each file (band), get the following information:
      • file handler returned from earthaccess.open()
      • tile id
      • band number
  2. Compile all the information you collected into a GeoDataFrame

BLOCK 4

Compile information about each scene (granule)
Each scene represents a data acquisition over a specific area at a given time. In this case, for each granule (scene), we are storing:

1. Scene Metadata

- Acquisition date (datetime)
- Granule ID (granule_id)
- Spatial location (tile_id)
- Download URLs (url)
- Scene coverage (geometry - bounding box or footprint)

2. Spectral Data

- Spectral bands (B02, B03, B04, etc.)
- Cloud mask (Fmask)
- Applied scale factor (e.g., * 0.0001 for reflectance)
- Scene cropped to watershed boundary (.rio.clip())
- Quality mask applied (.where(cloud_mask == 1))

How this function works:

- Extracts granule metadata:
  - Granule ID (GranuleUR)
  - Acquisition date (BeginningDateTime)
  - Spatial extent (polygon geometry)
- Retrieves download links for each granule using earthaccess.open()
- Uses regex to extract the tile ID and band info from filenames
- Stores all the information in a GeoDataFrame for easy analysis and plotting

In [6]:
def get_earthaccess_links(results):                                       
    url_re = re.compile(                                                  
        r'\.(?P<tile_id>\w+)\.\d+T\d+\.v\d\.\d\.(?P<band>[A-Za-z0-9]+)\.tif'
    )

    # Loop through each granule
    link_rows = []                                                       
    for granule in tqdm(results):
        # Get granule information
        info_dict = granule['umm']                                        
        granule_id = info_dict['GranuleUR']                               
        datetime = pd.to_datetime(                                        
            info_dict['TemporalExtent']['RangeDateTime']['BeginningDateTime']
        )

        # Extract spatial geometry (bounding polygon)
        try:
            points = (
                info_dict['SpatialExtent']['HorizontalSpatialDomain']['Geometry']['GPolygons'][0]['Boundary']['Points']   
            )
            geometry = Polygon([(point['Longitude'], point['Latitude']) for point in points])
        except KeyError:
            print(f" Warning: No geometry found for granule {granule_id}")    
            continue                                                          

        # Get file URLs
        files = earthaccess.open([granule])                                   

        # Build metadata DataFrame
        for file in files:
            match = url_re.search(file.full_name)
            if match is not None:
                link_rows.append(
                    dict(
                        datetime=datetime,
                        granule_id=granule_id,                            
                        tile_id=match.group('tile_id'),
                        band=match.group('band'),
                        url=str(file),                                    
                        geometry=geometry
                    )
                )

    # Convert to GeoDataFrame
    if link_rows:                                                         
        file_df = gpd.GeoDataFrame(link_rows, crs="EPSG:4326")
        return file_df
    else:
        print("No valid granules found.")
        return None

# Use the function with your search results
granules_gdf = get_earthaccess_links(results)                            

# Check results
unique_granules = granules_gdf['granule_id'].nunique()
print(f"Unique granules processed: {unique_granules}")
Unique granules processed: 88
In [7]:
# Show the granule footprints on an interactive map
granules_gdf.hvplot(geo=True, alpha=0.3, fill_color="red", line_color="black", tiles="EsriImagery")
Out[7]:
In [8]:
# Show bounding box coordinates
print("Bounding Box:", delta_gdf.total_bounds)
Bounding Box: [-89.97046834  29.68190731 -89.78679539  29.82339776]
In [9]:
filtered_granules = granules_gdf.sjoin(delta_gdf, predicate="intersects")
print(f"Granules intersecting the watershed: {filtered_granules['granule_id'].nunique()}")
Granules intersecting the watershed: 88
In [10]:
granules_gdf = granules_gdf.drop_duplicates(subset=['granule_id'])
print(f"Unique granules after removing duplicates: {granules_gdf['granule_id'].nunique()}")
Unique granules after removing duplicates: 88
In [11]:
granules_gdf = granules_gdf.sort_values(by="datetime").drop_duplicates(subset="granule_id", keep="last")
print(f"Granules after keeping latest version: {granules_gdf['granule_id'].nunique()}")
Granules after keeping latest version: 88
In [12]:
# Final debugging
# Step 1: Strict spatial filtering (fully inside watershed)
filtered_granules = granules_gdf.sjoin(delta_gdf, predicate="within")
print(f"Granules fully within the watershed: {filtered_granules['granule_id'].nunique()}")

# Step 2: Remove duplicate versions, keeping the latest
granules_gdf = granules_gdf.sort_values(by="datetime").drop_duplicates(subset="granule_id", keep="last")
print(f"Granules after keeping only the latest version: {granules_gdf['granule_id'].nunique()}")

# Step 3: Check unique dates
print("Unique dates of granules:", granules_gdf["datetime"].unique())
Granules fully within the watershed: 0
Granules after keeping only the latest version: 88
Unique dates of granules: <DatetimeArray>
['2023-05-04 16:31:32.101000+00:00', '2023-05-12 16:31:44.329000+00:00',
 '2023-05-20 16:31:23.029000+00:00', '2023-05-28 16:31:34.837000+00:00',
 '2023-06-05 16:31:28.153000+00:00', '2023-06-13 16:31:26.844000+00:00',
 '2023-06-21 16:31:34.135000+00:00', '2023-06-29 16:31:32.405000+00:00',
 '2023-07-07 16:31:46.664000+00:00', '2023-07-15 16:31:42.442000+00:00',
 '2023-07-23 16:31:50.873000+00:00', '2023-07-31 16:31:47.828000+00:00',
 '2023-08-08 16:31:57.564000+00:00', '2023-08-16 16:31:52.083000+00:00',
 '2023-08-24 16:32:05.861000+00:00', '2023-09-01 16:32:03.072000+00:00',
 '2023-09-09 16:32:06.260000+00:00', '2023-09-17 16:32:11.040000+00:00',
 '2023-09-25 16:32:12.175000+00:00', '2023-10-03 16:32:11.664000+00:00',
 '2023-10-19 16:32:21.176000+00:00', '2023-10-27 16:32:18.830000+00:00']
Length: 22, dtype: datetime64[ns, UTC]

Expected outcome: some granules may only partially overlap the watershed, there may be duplicate versions, or NASA may have updated the dataset with additional images. That is likely why we have 88 granules instead of the 76 the tutorial indicates.¶

Open, crop, and mask data¶

This will be the most resource-intensive step. I recommend caching your results using the cached decorator or by writing your own caching code. I also recommend testing this step with one or two dates before running the full computation.

This code should include at least one function including a numpy-style docstring. A good place to start would be a function for opening a single masked raster, applying the appropriate scale parameter, and cropping.

Try It
  1. For each granule:
    1. Open the Fmask band, crop, and compute a quality mask for the granule. You can use the following code as a starting point, making sure that mask_bits contains the quality bits you want to consider (a small worked example of the bit unpacking appears after this list):

       ```python
       # Expand into a new dimension of binary bits
       bits = (
           np.unpackbits(da.astype(np.uint8), bitorder='little')
           .reshape(da.shape + (-1,))
       )

       # Select the required bits and check if any are flagged
       mask = np.prod(bits[..., mask_bits] == 0, axis=-1)
       ```

    2. For each band that starts with ‘B’:

      1. Open the band, crop, and apply the scale factor
      2. Name the DataArray after the band using the .name attribute
      3. Apply the cloud mask using the .where() method
      4. Store the DataArray in your data structure (e.g. adding a GeoDataFrame column with the DataArray in it. Note that you will need to remove the rows for unused bands)
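
To make the bit logic in the starting-point code above more concrete, here is a small worked example with a made-up Fmask value. The specific value and the choice of mask_bits = [1, 2, 3] are illustrative; they match the bits used later in this notebook, and you should confirm against the HLS Fmask documentation which bits matter for your analysis.

```python
import numpy as np

# A made-up Fmask pixel value for illustration: binary 01000010.
# Bit 1 (one of this notebook's mask_bits) is set, so the pixel should be masked.
fmask_value = np.array([[0b01000010]], dtype=np.uint8)

# Expand each uint8 value into its 8 bits along a new trailing axis
bits = np.unpackbits(fmask_value, bitorder='little').reshape(fmask_value.shape + (-1,))
print(bits[0, 0])   # [0 1 0 0 0 0 1 0] -> bits 1 and 6 are set

# Keep the pixel only if none of the selected quality bits are flagged
mask_bits = [1, 2, 3]
mask = np.prod(bits[..., mask_bits] == 0, axis=-1)
print(mask)         # [[0]] -> masked out because bit 1 is set
```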

BLOCK 5

This block processes each granule in a structured way: it crops the Fmask band, computes a cloud mask, and applies that mask to each spectral band.

In [13]:
@cached('delta_reflectance_da_df')                        
def compute_reflectance_da(search_results, boundary_gdf): 
    """
    Connect to files over VSI, crop, cloud mask, and wrangle
    
    Returns a DataFrame with one row per spectral band per granule,
    including the cropped, cloud-masked reflectance DataArray
    in the 'da' column.

    Parameters
    ==========
    search_results : list
        earthaccess granule search results
    boundary_gdf : gpd.GeoDataFrame
        Boundary used to crop the data
    """
        
    def open_dataarray(url, boundary_gdf, scale=1, masked=True):
        # Open masked DataArray
        da = rxr.open_rasterio(url, masked=masked).squeeze() * scale

        # Reproject the boundary to the raster CRS and crop to its bounds
        boundary_proj_gdf = boundary_gdf.to_crs(da.rio.crs)
        return da.rio.clip_box(*boundary_proj_gdf.total_bounds)
        
    def compute_quality_mask(da, mask_bits=[1, 2, 3]):     
        """Mask out low quality data by bit, that means pixel using bit flags."""
        # Unpack bits into a new axis 
        bits = (                                         
            np.unpackbits(
                da.astype(np.uint8), bitorder='little'
            ).reshape(da.shape + (-1,))
        )

        # Select the required bits and check if any are flagged 
        mask = np.prod(bits[..., mask_bits]==0, axis=-1)
        return mask

    file_df = get_earthaccess_links(search_results)                
    granule_da_rows= []                                            
   

    # Loop through each image 
    group_iter = file_df.groupby(['datetime', 'tile_id'])
    for (datetime, tile_id), granule_df in tqdm(group_iter):
        print(f'Processing granule {tile_id} {datetime}')
              
        # Open granule cloud cover
        cloud_mask_url = (
            granule_df.loc[granule_df.band=='Fmask', 'url']
            .values[0])
        cloud_mask_cropped_da = open_dataarray(cloud_mask_url, boundary_gdf, masked=False)

        # Compute cloud mask
        cloud_mask = compute_quality_mask(cloud_mask_cropped_da)    

        #Filter only spectral bands
        band_df = granule_df[granule_df.band.str.startswith('B')]

        # Loop through each spectral band
        for _, row in band_df.iterrows():
            band_cropped = open_dataarray(row.url, boundary_gdf, scale=0.0001)
            band_cropped.name = row.band
            row['da'] = band_cropped.where(cloud_mask)             
            granule_da_rows.append(row.to_frame().T)                
    
    # Reassemble the metadata DataFrame
    return pd.concat(granule_da_rows, ignore_index=True)
reflectance_da_df = compute_reflectance_da(results, delta_gdf)     
                                                                   

BLOCK 6

The previous function processes the remote sensing reflectance data; here we visualize one of the resulting cropped, cloud-masked bands.

In [14]:
# visualize a processed band
reflectance_da_df.iloc[0]['da'].hvplot.image(cmap='viridis')
Out[14]:

Merge and Composite Data¶

You will notice for this watershed that:

  1. The raster data for each date are spread across 4 granules
  2. Any given image is incomplete because of clouds

Try It
  1. For each band:

    1. For each date:

      1. Merge all 4 granules
      2. Mask any negative values created by interpolating from the nodata value of -9999 (rioxarray should account for this, but doesn’t appear to when merging. If you leave these values in they will create problems down the line)
    2. Concatenate the merged DataArrays along a new date dimension

    3. Take the mean in the date dimension to create a composite image that fills cloud gaps

    4. Add the band as a dimension, and give the DataArray a name

  2. Concatenate along the band dimension

BLOCK 7

In [15]:
@cached('delta_reflectance_da')
def merge_and_composite_arrays(granule_da_df):
    """
    Efficiently merges and composites satellite image granules across bands and dates.
    """
    da_list = []

    for band, band_df in tqdm(granule_da_df.groupby('band')):
        # Merge granules per date and mask negatives
        merged_das = [
            rxrmerge.merge_arrays(list(date_df.da)).where(lambda x: x > 0)
            for _, date_df in band_df.groupby('datetime')
        ]
        
        # Composite across dates using the median
        composite_da = xr.concat(merged_das, dim='datetime').median(dim='datetime')

        # Assign band metadata
        composite_da = composite_da.assign_coords(band=int(band[1:])).expand_dims('band')
        composite_da.name = 'reflectance'

        da_list.append(composite_da)

    # Concatenate all bands into a final dataset
    return xr.concat(da_list, dim='band')

reflectance_da = merge_and_composite_arrays(reflectance_da_df)
#reflectance_da

STEP 4: K-MEANS¶

Cluster your data by spectral signature using the k-means algorithm.

Try It
  1. Convert your DataArray into a tidy DataFrame of reflectance values (hint: check out the .to_dataframe() and .unstack() methods)
  2. Filter out all rows with no data (all 0s or any N/A values)
  3. Fit a k-means model. You can experiment with the number of groups to find what works best.

BLOCK 8

In [16]:
# Convert DataArray to a tidy DataFrame
model_df = reflectance_da.to_dataframe().reset_index()

# Unstack to create a feature matrix (pixels as rows, bands as columns)
model_df = model_df.pivot(index=['y', 'x'], columns='band', values='reflectance')

# Remove rows with all 0s or NaN values
model_df = model_df[(model_df > 0).any(axis=1)].dropna()

# Fit K-Means model
n_clusters = 4  # You can adjust this number
kmeans = KMeans(n_clusters=n_clusters, random_state=42)

# **Apply K-Means clustering and store results**
model_df['clusters'] = kmeans.fit_predict(model_df)

# Show sample of the clustered data
print(model_df.head())
band                              1        2        3       4        5  \
y            x                                                           
3.287163e+06 793408.062907  0.09700  0.11950  0.16060  0.1753  0.26410   
             793438.062907  0.07180  0.08560  0.12880  0.1328  0.29880   
             793468.062907  0.04585  0.05455  0.09295  0.0868  0.37255   
             793498.062907  0.03710  0.04560  0.08760  0.0718  0.38380   
             793528.062907  0.02330  0.02880  0.06110  0.0380  0.37750   

band                              6        7        9      10       11  \
y            x                                                           
3.287163e+06 793408.062907  0.30230  0.22730  0.00090  0.2905  0.24750   
             793438.062907  0.26040  0.17350  0.00100  0.2860  0.24440   
             793468.062907  0.27145  0.15045  0.00085  0.2778  0.24675   
             793498.062907  0.24620  0.13380  0.00090  0.2695  0.23650   
             793528.062907  0.15420  0.06160  0.00120  0.2636  0.23240   

band                        clusters  
y            x                        
3.287163e+06 793408.062907         3  
             793438.062907         3  
             793468.062907         3  
             793498.062907         3  
             793528.062907         3  

STEP 5: PLOT¶

Try It

Create a plot that shows the k-means clusters next to an RGB image of the area. You may need to brighten your RGB image by multiplying it by 10. The code for reshaping and plotting the clusters is provided for you below, but you will have to create the RGB plot yourself!

So, what is .sortby(['x', 'y']) doing for us? Try the code without it and find out.

BLOCK 9

In [17]:
# Select the R, G, B bands and convert to uint8
rgb = reflectance_da.sel(band=[4, 3, 2])
rgb_uint8 = (rgb * 255).astype(np.uint8).where(~np.isnan(rgb), 0)  # avoid NaN

# Brighten the image, clipping to the valid 0-255 range
rgb_bright = np.clip(rgb_uint8 * 10, 0, 255)  # avoid extreme saturation

# Convert clusters to xarray in correct order
clusters_xr = model_df.clusters.to_xarray().sortby(['x', 'y'])

# Visualize with `hvplot`
plot = (
    rgb_bright.hvplot.rgb(
        x='x', y='y', bands='band',
        data_aspect=1, xaxis=None, yaxis=None
    ) +
    clusters_xr.hvplot(cmap="tab10", aspect='equal')
)

# graphics
plot
Out[17]:

Unsupervised analysis using K-Means for class clustering¶

Introduction¶

Land cover classification is a fundamental tool in environmental monitoring, as it allows for the assessment of landscape changes and their impact on ecosystems. In this analysis, we used NASA's Harmonized Landsat and Sentinel-2 (HLS) product, which provides harmonized and compatible surface reflectance (SR) data from the Landsat-8 and Sentinel-2 missions.

The HLS product provides satellite imagery with Bottom of Atmosphere (BOA) surface reflectance. This means that the data has already been atmospherically corrected, removing the effects of scattering and atmospheric absorption, which enables better comparison between images from different dates or sensors.

The HLS dataset integrates observations from the Operational Land Imager (OLI) onboard Landsat-8 and the Multispectral Instrument (MSI) from Sentinel-2, generating a time series of images with increased temporal frequency and spectral consistency. HLS data products can be considered the building blocks of a "data cube", allowing users to examine any pixel over time and analyze near-daily surface reflectance time series as if they were derived from a single sensor. This feature enhances data continuity and facilitates multitemporal land cover analysis.

Additionally, the HLS product employs standardized processing methods across all images, including atmospheric correction and the Fmask algorithm for cloud masking. Surface reflectance is corrected to account for the effect of the viewing angle, ensuring that all pixels are normalized to nadir observation. This guarantees greater accuracy in the spectral interpretation of land cover.

For this study, we used images acquired between May 2023 and October 2023, with a spatial resolution of 30 meters, resulting in a total of 88 scenes or granules. The data were compiled and organized by acquisition date over the study area. A scale factor of 0.0001 was applied to all reflectance bands to convert the stored integer values to surface reflectance.

This study applies an unsupervised classification (clustering) approach to identify and map different land cover classes along the Mississippi River, a key ecosystem in North America. The methodology includes image preprocessing, spectral feature extraction, and the implementation of clustering algorithms to generate a detailed land cover map for the study region.

The results obtained can contribute to monitoring environmental changes, managing natural resources, and making informed decisions regarding the conservation of Mississippi’s riparian ecosystems.

Methodology¶

The analysis was conducted using the K-Means clustering algorithm to segment an image into different groups based on similar data characteristics. A total of four clusters (K=4) were selected to minimize variability within each group while maximizing the differences between them.

The methodological process followed these steps:

  1. Data Preprocessing: The image was prepared for analysis by normalizing spectral bands or pixel values to improve clustering accuracy.

  2. Application of the K-Means Algorithm: The algorithm was executed with K=4, assigning each pixel to one of the four clusters based on similarity in feature space.

  3. Results Visualization: A segmented image was generated where each cluster was represented by a distinct color, facilitating the interpretation of homogeneous areas.

  4. Cluster Analysis and Interpretation: The spatial distribution of clusters was evaluated, and possible meanings were assigned based on visual appearance and geographical context.
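
The choice of K = 4 above is empirical. As an optional sketch (not part of the original analysis) of how that choice could be examined, an inertia "elbow" sweep over several candidate values of K might look like the following, assuming model_df still holds the reflectance band columns used for clustering:

```python
# Optional sketch: compare k-means inertia for several cluster counts.
# Assumes model_df contains the reflectance band columns used above.
from sklearn.cluster import KMeans

features = model_df.drop(columns=['clusters'], errors='ignore')

inertias = {}
for k in range(2, 9):
    km = KMeans(n_clusters=k, n_init=10, random_state=42)
    km.fit(features)
    inertias[k] = km.inertia_

# Look for the "elbow" where adding clusters stops reducing inertia much
for k, inertia in inertias.items():
    print(f'K={k}: inertia={inertia:,.1f}')
```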

Conclusion¶

Four data groups were identified, giving a classification with low internal variability and high differentiation between clusters. The K-Means algorithm, while not providing a semantic interpretation of the groups, enabled the identification of patterns in the segmented image.

Visual analysis suggests the following interpretation of the clusters:

Cluster 0 (Blue): Represents well-defined bodies of water.

Cluster 1 (Red): Corresponds to the edges of water bodies, possibly mud or soil with a high-density cover.

Cluster 2 (Pink): Areas with sparse vegetation or bare soil.

Cluster 3 (Cyan): Transition zones between water and land, possibly representing wetlands or saturated soils.

This analysis provides an initial approach to terrain segmentation and can be improved with additional information, such as spectral data or field validation, to enhance the interpretation of each cluster in environmental and geospatial monitoring studies.