Regional Analysis
This example demonstrates how to perform regional hydrological analysis using MarrmotFlow across multiple catchments with varying characteristics.
Overview
Regional analysis in hydrology involves:
Multi-catchment modeling across diverse landscapes
Spatial pattern analysis of hydrological responses
Catchment classification based on characteristics
Regional calibration and parameter transfer
Scaling relationships and regionalization
This example shows how to implement comprehensive regional analysis with MarrmotFlow.
Prerequisites
Regional dataset with multiple catchments:
data/
├── catchments/
│ ├── regional_catchments.shp # Multiple catchments
│ └── catchment_attributes.csv # Physical characteristics
├── climate/
│ ├── gridded_precip_2000_2020.nc
│ └── gridded_temp_2000_2020.nc
├── observations/
│ ├── streamflow_gauges.csv
│ └── gauge_locations.shp
└── dem/
└── regional_dem.tif
Step 1: Setup and Data Loading
import geopandas as gpd
import xarray as xr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from scipy import stats
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from marrmotflow import MARRMOTWorkflow
# Set up analysis parameters
ANALYSIS_PERIOD = "2000-2020"
MODELS_TO_TEST = {"HBV-96": 7, "GR4J": 37}
Regional Data Management
class RegionalAnalysis:
"""Framework for regional hydrological analysis using MarrmotFlow."""
def __init__(self, catchment_file, attribute_file=None):
self.catchments = gpd.read_file(catchment_file)
self.n_catchments = len(self.catchments)
# Load catchment attributes if provided
if attribute_file:
self.attributes = pd.read_csv(attribute_file)
self.catchments = self.catchments.merge(
self.attributes, on='catchment_id', how='left'
)
self.workflows = {}
self.results = {}
def load_regional_climate(self, precip_file, temp_file):
"""Load gridded climate data covering all catchments."""
self.climate_data = xr.Dataset({
'precipitation': xr.open_dataarray(precip_file),
'temperature': xr.open_dataarray(temp_file)
})
# Verify spatial coverage
self._check_spatial_coverage()
def _check_spatial_coverage(self):
"""Check that climate data covers all catchments."""
cat_bounds = self.catchments.total_bounds
clim_bounds = [
self.climate_data.lon.min().item(),
self.climate_data.lat.min().item(),
self.climate_data.lon.max().item(),
self.climate_data.lat.max().item()
]
if not (clim_bounds[0] <= cat_bounds[0] and
clim_bounds[1] <= cat_bounds[1] and
clim_bounds[2] >= cat_bounds[2] and
clim_bounds[3] >= cat_bounds[3]):
print("Warning: Climate data may not fully cover all catchments")
print(f"Climate data coverage: {clim_bounds}")
print(f"Catchment bounds: {cat_bounds}")
# Initialize regional analysis
regional = RegionalAnalysis(
"data/catchments/regional_catchments.shp",
"data/catchments/catchment_attributes.csv"
)
print(f"Loaded {regional.n_catchments} catchments for regional analysis")
Load and Explore Catchment Characteristics
# Display catchment characteristics
print("Catchment Characteristics Summary:")
if hasattr(regional, 'attributes'):
numeric_cols = regional.catchments.select_dtypes(include=[np.number]).columns
print(regional.catchments[numeric_cols].describe())
# Visualize catchment characteristics
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
characteristics = ['area_km2', 'mean_elevation', 'mean_slope',
'forest_fraction', 'urban_fraction', 'mean_annual_precip']
for i, char in enumerate(characteristics):
if char in regional.catchments.columns:
row, col = i // 3, i % 3
regional.catchments[char].hist(bins=20, ax=axes[row, col], alpha=0.7)
axes[row, col].set_title(f'{char.replace("_", " ").title()}')
axes[row, col].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
Step 2: Catchment Classification
Classify Based on Physical Characteristics
def classify_catchments(catchments, classification_vars, n_clusters=4):
"""Classify catchments based on physical characteristics."""
# Prepare data for clustering
cluster_data = catchments[classification_vars].dropna()
# Standardize variables
scaler = StandardScaler()
scaled_data = scaler.fit_transform(cluster_data)
# Perform k-means clustering
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
cluster_labels = kmeans.fit_predict(scaled_data)
# Add cluster labels to catchments
catchments_classified = catchments.copy()
catchments_classified.loc[cluster_data.index, 'cluster'] = cluster_labels
# Calculate cluster characteristics
cluster_summary = []
for i in range(n_clusters):
cluster_catchments = catchments_classified[catchments_classified['cluster'] == i]
summary = {
'cluster': i,
'n_catchments': len(cluster_catchments),
**{var: cluster_catchments[var].mean() for var in classification_vars}
}
cluster_summary.append(summary)
cluster_df = pd.DataFrame(cluster_summary)
return catchments_classified, cluster_df, scaler, kmeans
# Classify catchments
classification_vars = ['area_km2', 'mean_elevation', 'mean_slope',
'forest_fraction', 'mean_annual_precip']
available_vars = [var for var in classification_vars if var in regional.catchments.columns]
if len(available_vars) >= 3: # Need at least 3 variables for meaningful classification
catchments_classified, cluster_summary, scaler, kmeans = classify_catchments(
regional.catchments, available_vars, n_clusters=4
)
print("Catchment Classification Results:")
print(cluster_summary)
# Visualize clusters
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
# Plot clusters on map
catchments_classified.plot(column='cluster', categorical=True,
legend=True, ax=axes[0], cmap='Set1')
axes[0].set_title('Catchment Clusters (Spatial Distribution)')
# Plot cluster characteristics
cluster_summary_melted = cluster_summary.melt(
id_vars=['cluster', 'n_catchments'],
value_vars=available_vars,
var_name='characteristic',
value_name='value'
)
sns.boxplot(data=cluster_summary_melted, x='characteristic', y='value',
hue='cluster', ax=axes[1])
axes[1].set_title('Cluster Characteristics')
axes[1].tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()
else:
print("Insufficient catchment attributes for classification")
catchments_classified = regional.catchments.copy()
catchments_classified['cluster'] = 0 # Single cluster
Step 3: Multi-Catchment Workflow Creation
Create Individual Catchment Workflows
def create_regional_workflows(catchments, climate_data, models_to_test):
"""Create workflows for all catchments and models."""
workflows = {}
for idx, catchment in catchments.iterrows():
catchment_id = catchment.get('catchment_id', f'catchment_{idx}')
# Extract climate data for this catchment
catchment_climate = extract_catchment_climate(catchment, climate_data)
# Save catchment-specific climate data
climate_file = f"temp_climate_{catchment_id}.nc"
catchment_climate.to_netcdf(climate_file)
# Create single-catchment GeoDataFrame
single_catchment = gpd.GeoDataFrame([catchment], crs=catchments.crs)
for model_name, model_number in models_to_test.items():
workflow_name = f"{catchment_id}_{model_name}"
try:
workflows[workflow_name] = MARRMOTWorkflow(
name=workflow_name,
cat=single_catchment,
forcing_files=climate_file,
forcing_vars={"precip": "precipitation", "temp": "temperature"},
forcing_units={"precip": "mm/day", "temp": "celsius"},
model_number=model_number,
pet_method="penman_monteith"
)
except Exception as e:
print(f"Warning: Could not create workflow for {workflow_name}: {e}")
return workflows
def extract_catchment_climate(catchment, climate_data):
"""Extract climate data for a specific catchment."""
# Get catchment centroid for point extraction
centroid = catchment.geometry.centroid
lon, lat = centroid.x, centroid.y
# Find nearest grid points
lon_idx = np.argmin(np.abs(climate_data.lon - lon))
lat_idx = np.argmin(np.abs(climate_data.lat - lat))
# Extract point data
point_data = climate_data.isel(lon=lon_idx, lat=lat_idx)
return point_data
# Load regional climate data
regional.load_regional_climate(
"data/climate/gridded_precip_2000_2020.nc",
"data/climate/gridded_temp_2000_2020.nc"
)
# Create workflows for all catchments
regional.workflows = create_regional_workflows(
catchments_classified, regional.climate_data, MODELS_TO_TEST
)
print(f"Created {len(regional.workflows)} workflows for regional analysis")
print(f"Workflows per catchment: {len(MODELS_TO_TEST)}")
Step 4: Simulated Regional Results Analysis
Generate Mock Regional Results
def generate_regional_results(catchments, models_to_test, analysis_period):
"""Generate mock regional results for demonstration."""
start_date = f"{analysis_period.split('-')[0]}-01-01"
end_date = f"{analysis_period.split('-')[1]}-12-31"
dates = pd.date_range(start_date, end_date, freq='D')
results = {}
for idx, catchment in catchments.iterrows():
catchment_id = catchment.get('catchment_id', f'catchment_{idx}')
# Catchment-specific characteristics affecting discharge
area = catchment.get('area_km2', 100)
elevation = catchment.get('mean_elevation', 500)
forest = catchment.get('forest_fraction', 0.5)
# Base discharge influenced by catchment characteristics
base_discharge = (
2.0 * (area / 100)**0.3 * # Size effect
(elevation / 1000)**0.2 * # Elevation effect
(forest + 0.5) # Vegetation effect
)
# Seasonal pattern
seasonal = 0.5 * np.sin(2 * np.pi * np.arange(len(dates)) / 365.25)
for model_name, model_number in models_to_test.items():
# Model-specific bias
model_bias = {"HBV-96": 0.1, "GR4J": -0.05}[model_name]
# Random variation
noise = np.random.normal(0, 0.2, len(dates))
# Combine effects
discharge = np.maximum(0, base_discharge + seasonal + model_bias + noise)
workflow_name = f"{catchment_id}_{model_name}"
results[workflow_name] = {
'discharge': pd.Series(discharge, index=dates, name='discharge'),
'catchment_id': catchment_id,
'model': model_name,
'catchment_area': area,
'catchment_elevation': elevation
}
return results
# Generate regional results
regional.results = generate_regional_results(
catchments_classified, MODELS_TO_TEST, ANALYSIS_PERIOD
)
print(f"Generated results for {len(regional.results)} catchment-model combinations")
Step 5: Regional Pattern Analysis
Analyze Spatial Patterns
def analyze_regional_patterns(results, catchments):
"""Analyze spatial patterns in hydrological variables."""
# Extract catchment-level statistics
catchment_stats = {}
for workflow_name, result in results.items():
catchment_id = result['catchment_id']
model = result['model']
discharge = result['discharge']
if catchment_id not in catchment_stats:
catchment_stats[catchment_id] = {}
# Calculate statistics
stats = {
'mean_annual_discharge': discharge.groupby(discharge.index.year).sum().mean(),
'cv_annual': discharge.groupby(discharge.index.year).sum().std() /
discharge.groupby(discharge.index.year).sum().mean(),
'mean_daily_discharge': discharge.mean(),
'q95': discharge.quantile(0.95),
'q05': discharge.quantile(0.05),
'baseflow_index': estimate_baseflow_index(discharge)
}
catchment_stats[catchment_id][model] = stats
return catchment_stats
def estimate_baseflow_index(discharge):
"""Estimate baseflow index using a simple approach."""
# Simple baseflow separation: 7-day minimum rolling window
baseflow = discharge.rolling(window=7, min_periods=1).min()
return baseflow.mean() / discharge.mean()
def plot_regional_patterns(catchment_stats, catchments):
"""Plot regional patterns in hydrological responses."""
# Prepare data for plotting
plot_data = []
for catchment_id, model_stats in catchment_stats.items():
for model, stats in model_stats.items():
plot_data.append({
'catchment_id': catchment_id,
'model': model,
**stats
})
plot_df = pd.DataFrame(plot_data)
# Merge with catchment characteristics
catchment_attrs = catchments.drop('geometry', axis=1)
plot_df = plot_df.merge(catchment_attrs, on='catchment_id', how='left')
# Create subplots for different relationships
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
# Mean discharge vs. area
for model in MODELS_TO_TEST.keys():
model_data = plot_df[plot_df['model'] == model]
axes[0,0].scatter(model_data['area_km2'], model_data['mean_annual_discharge'],
label=model, alpha=0.7)
axes[0,0].set_xlabel('Catchment Area (km²)')
axes[0,0].set_ylabel('Mean Annual Discharge (mm)')
axes[0,0].set_title('Discharge vs. Catchment Area')
axes[0,0].legend()
axes[0,0].grid(True, alpha=0.3)
# Discharge vs. elevation
for model in MODELS_TO_TEST.keys():
model_data = plot_df[plot_df['model'] == model]
axes[0,1].scatter(model_data['mean_elevation'], model_data['mean_annual_discharge'],
label=model, alpha=0.7)
axes[0,1].set_xlabel('Mean Elevation (m)')
axes[0,1].set_ylabel('Mean Annual Discharge (mm)')
axes[0,1].set_title('Discharge vs. Elevation')
axes[0,1].legend()
axes[0,1].grid(True, alpha=0.3)
# Coefficient of variation vs. forest fraction
if 'forest_fraction' in plot_df.columns:
for model in MODELS_TO_TEST.keys():
model_data = plot_df[plot_df['model'] == model]
axes[0,2].scatter(model_data['forest_fraction'], model_data['cv_annual'],
label=model, alpha=0.7)
axes[0,2].set_xlabel('Forest Fraction')
axes[0,2].set_ylabel('CV of Annual Discharge')
axes[0,2].set_title('Variability vs. Forest Cover')
axes[0,2].legend()
axes[0,2].grid(True, alpha=0.3)
# Baseflow index vs. characteristics
for model in MODELS_TO_TEST.keys():
model_data = plot_df[plot_df['model'] == model]
axes[1,0].scatter(model_data['area_km2'], model_data['baseflow_index'],
label=model, alpha=0.7)
axes[1,0].set_xlabel('Catchment Area (km²)')
axes[1,0].set_ylabel('Baseflow Index')
axes[1,0].set_title('Baseflow vs. Area')
axes[1,0].legend()
axes[1,0].grid(True, alpha=0.3)
# Model comparison
model_comparison = plot_df.pivot_table(
values='mean_annual_discharge',
index='catchment_id',
columns='model'
)
if len(MODELS_TO_TEST) == 2:
model_names = list(MODELS_TO_TEST.keys())
axes[1,1].scatter(model_comparison[model_names[0]],
model_comparison[model_names[1]], alpha=0.7)
axes[1,1].plot([0, model_comparison.values.max()], [0, model_comparison.values.max()],
'r--', alpha=0.5)
axes[1,1].set_xlabel(f'{model_names[0]} Discharge (mm)')
axes[1,1].set_ylabel(f'{model_names[1]} Discharge (mm)')
axes[1,1].set_title('Model Comparison')
axes[1,1].grid(True, alpha=0.3)
# Regional statistics summary
regional_summary = plot_df.groupby('model').agg({
'mean_annual_discharge': ['mean', 'std'],
'cv_annual': ['mean', 'std'],
'baseflow_index': ['mean', 'std']
}).round(3)
# Display summary as text
axes[1,2].axis('off')
summary_text = "Regional Summary:\n\n"
for model in MODELS_TO_TEST.keys():
summary_text += f"{model}:\n"
summary_text += f" Mean Discharge: {regional_summary.loc[model, ('mean_annual_discharge', 'mean')]:.1f} ± {regional_summary.loc[model, ('mean_annual_discharge', 'std')]:.1f} mm\n"
summary_text += f" CV Annual: {regional_summary.loc[model, ('cv_annual', 'mean')]:.3f} ± {regional_summary.loc[model, ('cv_annual', 'std')]:.3f}\n"
summary_text += f" Baseflow Index: {regional_summary.loc[model, ('baseflow_index', 'mean')]:.3f} ± {regional_summary.loc[model, ('baseflow_index', 'std')]:.3f}\n\n"
axes[1,2].text(0.1, 0.9, summary_text, transform=axes[1,2].transAxes,
fontsize=10, verticalalignment='top', fontfamily='monospace')
plt.tight_layout()
plt.show()
return plot_df
# Analyze regional patterns
catchment_statistics = analyze_regional_patterns(regional.results, catchments_classified)
regional_patterns = plot_regional_patterns(catchment_statistics, catchments_classified)
Step 6: Model Performance by Region
Assess Model Performance Across Clusters
def assess_regional_model_performance(regional_patterns, catchments_classified):
"""Assess model performance by catchment clusters."""
# Merge with cluster information
cluster_info = catchments_classified[['catchment_id', 'cluster']].copy()
performance_data = regional_patterns.merge(cluster_info, on='catchment_id', how='left')
# Calculate performance metrics by cluster
cluster_performance = {}
for cluster_id in performance_data['cluster'].unique():
if pd.isna(cluster_id):
continue
cluster_data = performance_data[performance_data['cluster'] == cluster_id]
cluster_performance[cluster_id] = {
'n_catchments': len(cluster_data) // len(MODELS_TO_TEST),
'model_stats': {}
}
for model in MODELS_TO_TEST.keys():
model_data = cluster_data[cluster_data['model'] == model]
cluster_performance[cluster_id]['model_stats'][model] = {
'mean_discharge': model_data['mean_annual_discharge'].mean(),
'std_discharge': model_data['mean_annual_discharge'].std(),
'mean_cv': model_data['cv_annual'].mean(),
'mean_baseflow': model_data['baseflow_index'].mean()
}
return cluster_performance, performance_data
def plot_cluster_performance(cluster_performance, performance_data):
"""Plot model performance by catchment clusters."""
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
# Prepare data for box plots
discharge_data = []
cv_data = []
baseflow_data = []
for _, row in performance_data.iterrows():
if not pd.isna(row['cluster']):
discharge_data.append({
'cluster': int(row['cluster']),
'model': row['model'],
'mean_annual_discharge': row['mean_annual_discharge']
})
cv_data.append({
'cluster': int(row['cluster']),
'model': row['model'],
'cv_annual': row['cv_annual']
})
baseflow_data.append({
'cluster': int(row['cluster']),
'model': row['model'],
'baseflow_index': row['baseflow_index']
})
discharge_df = pd.DataFrame(discharge_data)
cv_df = pd.DataFrame(cv_data)
baseflow_df = pd.DataFrame(baseflow_data)
# Box plots by cluster
sns.boxplot(data=discharge_df, x='cluster', y='mean_annual_discharge',
hue='model', ax=axes[0,0])
axes[0,0].set_title('Annual Discharge by Cluster')
axes[0,0].set_ylabel('Mean Annual Discharge (mm)')
sns.boxplot(data=cv_df, x='cluster', y='cv_annual',
hue='model', ax=axes[0,1])
axes[0,1].set_title('Discharge Variability by Cluster')
axes[0,1].set_ylabel('CV Annual Discharge')
sns.boxplot(data=baseflow_df, x='cluster', y='baseflow_index',
hue='model', ax=axes[1,0])
axes[1,0].set_title('Baseflow Index by Cluster')
axes[1,0].set_ylabel('Baseflow Index')
# Model agreement by cluster
model_agreement = []
for cluster_id, cluster_data in cluster_performance.items():
models = list(cluster_data['model_stats'].keys())
if len(models) == 2:
model1_discharge = cluster_data['model_stats'][models[0]]['mean_discharge']
model2_discharge = cluster_data['model_stats'][models[1]]['mean_discharge']
agreement = abs(model1_discharge - model2_discharge) / max(model1_discharge, model2_discharge)
model_agreement.append({'cluster': cluster_id, 'disagreement': agreement})
if model_agreement:
agreement_df = pd.DataFrame(model_agreement)
agreement_df['cluster'].value_counts().sort_index().plot(kind='bar', ax=axes[1,1])
axes[1,1].set_title('Number of Catchments by Cluster')
axes[1,1].set_ylabel('Number of Catchments')
axes[1,1].tick_params(axis='x', rotation=0)
plt.tight_layout()
plt.show()
return cluster_performance
# Assess model performance by region
cluster_perf, perf_data = assess_regional_model_performance(regional_patterns, catchments_classified)
cluster_performance = plot_cluster_performance(cluster_perf, perf_data)
print("Regional Model Performance Summary:")
for cluster_id, cluster_data in cluster_perf.items():
print(f"\nCluster {cluster_id} ({cluster_data['n_catchments']} catchments):")
for model, stats in cluster_data['model_stats'].items():
print(f" {model}:")
print(f" Mean discharge: {stats['mean_discharge']:.1f} ± {stats['std_discharge']:.1f} mm")
print(f" Mean CV: {stats['mean_cv']:.3f}")
print(f" Mean baseflow index: {stats['mean_baseflow']:.3f}")
Step 7: Regionalization and Parameter Transfer
Develop Regional Relationships
def develop_regional_relationships(regional_patterns, catchments_classified):
"""Develop relationships for parameter regionalization."""
# Focus on key hydrological signatures
signatures = ['mean_annual_discharge', 'cv_annual', 'baseflow_index', 'q95']
predictors = ['area_km2', 'mean_elevation', 'mean_slope', 'forest_fraction']
# Available predictors in dataset
available_predictors = [p for p in predictors if p in catchments_classified.columns]
relationships = {}
for model in MODELS_TO_TEST.keys():
model_data = regional_patterns[regional_patterns['model'] == model].copy()
# Merge with catchment characteristics
model_data = model_data.merge(
catchments_classified[['catchment_id'] + available_predictors],
on='catchment_id', how='left'
)
relationships[model] = {}
for signature in signatures:
if signature in model_data.columns:
relationships[model][signature] = {}
for predictor in available_predictors:
# Calculate correlation
valid_data = model_data[[signature, predictor]].dropna()
if len(valid_data) > 3:
correlation, p_value = stats.pearsonr(
valid_data[signature], valid_data[predictor]
)
# Simple linear regression
slope, intercept, r_value, p_value, std_err = stats.linregress(
valid_data[predictor], valid_data[signature]
)
relationships[model][signature][predictor] = {
'correlation': correlation,
'r_squared': r_value**2,
'slope': slope,
'intercept': intercept,
'p_value': p_value,
'n_samples': len(valid_data)
}
return relationships
def plot_regional_relationships(relationships, regional_patterns, catchments_classified):
"""Plot key regional relationships."""
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
# Select best relationships to plot
plot_configs = [
('mean_annual_discharge', 'area_km2', 'Discharge vs Area'),
('mean_annual_discharge', 'mean_elevation', 'Discharge vs Elevation'),
('cv_annual', 'forest_fraction', 'CV vs Forest Fraction'),
('baseflow_index', 'area_km2', 'Baseflow vs Area'),
('q95', 'mean_elevation', 'High Flows vs Elevation'),
('cv_annual', 'mean_slope', 'CV vs Slope')
]
for i, (signature, predictor, title) in enumerate(plot_configs):
if i >= 6: # Only 6 subplots
break
row, col = i // 3, i % 3
for model in MODELS_TO_TEST.keys():
model_data = regional_patterns[regional_patterns['model'] == model].copy()
model_data = model_data.merge(
catchments_classified[['catchment_id', predictor]],
on='catchment_id', how='left'
)
if signature in model_data.columns and predictor in model_data.columns:
valid_data = model_data[[signature, predictor]].dropna()
if len(valid_data) > 0:
axes[row, col].scatter(valid_data[predictor], valid_data[signature],
label=model, alpha=0.7)
# Add regression line if relationship exists
if (model in relationships and
signature in relationships[model] and
predictor in relationships[model][signature]):
rel = relationships[model][signature][predictor]
if rel['p_value'] < 0.05: # Significant relationship
x_range = np.linspace(valid_data[predictor].min(),
valid_data[predictor].max(), 100)
y_pred = rel['slope'] * x_range + rel['intercept']
axes[row, col].plot(x_range, y_pred, '--', alpha=0.8)
axes[row, col].set_xlabel(predictor.replace('_', ' ').title())
axes[row, col].set_ylabel(signature.replace('_', ' ').title())
axes[row, col].set_title(title)
axes[row, col].legend()
axes[row, col].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# Develop regional relationships
regional_relationships = develop_regional_relationships(regional_patterns, catchments_classified)
plot_regional_relationships(regional_relationships, regional_patterns, catchments_classified)
# Print significant relationships
print("Significant Regional Relationships (p < 0.05):")
for model, signatures in regional_relationships.items():
print(f"\n{model}:")
for signature, predictors in signatures.items():
for predictor, stats in predictors.items():
if stats['p_value'] < 0.05 and stats['r_squared'] > 0.1:
print(f" {signature} vs {predictor}: R² = {stats['r_squared']:.3f}, p = {stats['p_value']:.3f}")
Step 8: Regional Summary and Validation
Cross-Validation of Regional Models
def cross_validate_regional_model(relationships, regional_patterns, catchments_classified, signature, predictor):
"""Perform leave-one-out cross-validation for regional relationships."""
results = {}
for model in MODELS_TO_TEST.keys():
if (model in relationships and
signature in relationships[model] and
predictor in relationships[model][signature]):
model_data = regional_patterns[regional_patterns['model'] == model].copy()
model_data = model_data.merge(
catchments_classified[['catchment_id', predictor]],
on='catchment_id', how='left'
)
valid_data = model_data[[signature, predictor, 'catchment_id']].dropna()
if len(valid_data) > 5: # Need sufficient data for CV
predictions = []
observations = []
for i in range(len(valid_data)):
# Leave one out
train_data = valid_data.drop(valid_data.index[i])
test_data = valid_data.iloc[i]
# Fit model on training data
slope, intercept, _, _, _ = stats.linregress(
train_data[predictor], train_data[signature]
)
# Predict on test data
prediction = slope * test_data[predictor] + intercept
predictions.append(prediction)
observations.append(test_data[signature])
# Calculate validation statistics
predictions = np.array(predictions)
observations = np.array(observations)
cv_r2 = stats.pearsonr(observations, predictions)[0]**2
cv_rmse = np.sqrt(np.mean((observations - predictions)**2))
cv_bias = np.mean(predictions - observations)
results[model] = {
'cv_r2': cv_r2,
'cv_rmse': cv_rmse,
'cv_bias': cv_bias,
'n_samples': len(valid_data)
}
return results
# Validate key relationships
validation_results = {}
key_relationships = [
('mean_annual_discharge', 'area_km2'),
('baseflow_index', 'forest_fraction'),
('cv_annual', 'mean_elevation')
]
for signature, predictor in key_relationships:
if predictor in catchments_classified.columns:
cv_results = cross_validate_regional_model(
regional_relationships, regional_patterns,
catchments_classified, signature, predictor
)
if cv_results:
validation_results[f"{signature}_vs_{predictor}"] = cv_results
print("Cross-Validation Results:")
for relationship, models in validation_results.items():
print(f"\n{relationship}:")
for model, stats in models.items():
print(f" {model}: R² = {stats['cv_r2']:.3f}, RMSE = {stats['cv_rmse']:.3f}, Bias = {stats['cv_bias']:.3f}")
Generate Regional Summary Report
def generate_regional_report(regional_patterns, cluster_perf, regional_relationships, validation_results):
"""Generate comprehensive regional analysis report."""
report = {
'summary_statistics': {},
'regional_patterns': {},
'model_performance': {},
'regionalization': {},
'recommendations': []
}
# Summary statistics
for model in MODELS_TO_TEST.keys():
model_data = regional_patterns[regional_patterns['model'] == model]
report['summary_statistics'][model] = {
'n_catchments': len(model_data),
'mean_discharge_range': [model_data['mean_annual_discharge'].min(),
model_data['mean_annual_discharge'].max()],
'mean_discharge_mean': model_data['mean_annual_discharge'].mean(),
'cv_range': [model_data['cv_annual'].min(), model_data['cv_annual'].max()],
'baseflow_range': [model_data['baseflow_index'].min(), model_data['baseflow_index'].max()]
}
# Regional patterns
report['regional_patterns'] = {
'n_clusters': len(cluster_perf),
'cluster_characteristics': cluster_perf
}
# Regionalization success
successful_relationships = 0
total_relationships = 0
for model, signatures in regional_relationships.items():
for signature, predictors in signatures.items():
for predictor, stats in predictors.items():
total_relationships += 1
if stats['p_value'] < 0.05 and stats['r_squared'] > 0.1:
successful_relationships += 1
report['regionalization'] = {
'successful_relationships': successful_relationships,
'total_relationships': total_relationships,
'success_rate': successful_relationships / total_relationships if total_relationships > 0 else 0,
'validation_results': validation_results
}
# Recommendations
report['recommendations'] = [
f"Analyzed {len(regional_patterns)//len(MODELS_TO_TEST)} catchments across {len(cluster_perf)} clusters",
f"Found {successful_relationships} significant relationships out of {total_relationships} tested",
"Consider cluster-specific model selection for improved performance",
"Develop parameter transfer functions for ungauged catchments",
"Validate regional relationships with additional data when available"
]
return report
# Generate final report
regional_report = generate_regional_report(
regional_patterns, cluster_perf, regional_relationships, validation_results
)
print("REGIONAL ANALYSIS REPORT")
print("=" * 40)
print(f"Analysis Period: {ANALYSIS_PERIOD}")
print(f"Models Tested: {list(MODELS_TO_TEST.keys())}")
print(f"Catchments Analyzed: {regional_report['summary_statistics'][list(MODELS_TO_TEST.keys())[0]]['n_catchments']}")
print(f"Clusters Identified: {regional_report['regional_patterns']['n_clusters']}")
print(f"Regionalization Success Rate: {regional_report['regionalization']['success_rate']:.1%}")
for recommendation in regional_report['recommendations']:
print(f"• {recommendation}")
Complete Regional Analysis Script
#!/usr/bin/env python3
"""
Complete regional hydrological analysis using MarrmotFlow
"""
import geopandas as gpd
import xarray as xr
import pandas as pd
import numpy as np
from marrmotflow import MARRMOTWorkflow
def main():
# Configuration
MODELS_TO_TEST = {"HBV-96": 7, "GR4J": 37}
# Load regional dataset
catchments = gpd.read_file("data/catchments/regional_catchments.shp")
print("Regional Hydrological Analysis")
print("=" * 35)
print(f"Number of catchments: {len(catchments)}")
print(f"Spatial extent: {catchments.total_bounds}")
print(f"Models to test: {list(MODELS_TO_TEST.keys())}")
# Create workflows for all catchments
workflows = {}
for idx, catchment in catchments.iterrows():
catchment_id = catchment.get('catchment_id', f'catchment_{idx}')
single_catchment = gpd.GeoDataFrame([catchment], crs=catchments.crs)
for model_name, model_number in MODELS_TO_TEST.items():
workflow_name = f"{catchment_id}_{model_name}"
workflows[workflow_name] = MARRMOTWorkflow(
name=workflow_name,
cat=single_catchment,
forcing_files="data/climate/gridded_climate.nc",
forcing_vars={"precip": "precipitation", "temp": "temperature"},
forcing_units={"precip": "mm/day", "temp": "celsius"},
model_number=model_number,
pet_method="penman_monteith"
)
print(f"\nCreated {len(workflows)} workflows for regional analysis")
print("Ready for regional hydrological modeling!")
if __name__ == "__main__":
main()
Key Insights from Regional Analysis
This comprehensive example demonstrates:
Spatial heterogeneity: Hydrological responses vary systematically across regions
Catchment classification: Physical characteristics can group similar catchments
Model transferability: Different models may perform better in different regions
Scaling relationships: Discharge patterns relate to catchment characteristics
Regionalization potential: Parameter transfer functions can be developed
Applications
Ungauged basin prediction: Transfer parameters to similar catchments
Model selection guidance: Choose appropriate models for different regions
Water resource assessment: Understand regional water availability patterns
Climate impact assessment: Apply regional relationships to future scenarios
Monitoring network design: Identify representative catchments for monitoring