Multi-Model Comparison
This example demonstrates how to compare multiple MARRMOT models using MarrmotFlow to assess model uncertainty and performance differences.
Overview
In this example, we will:
Set up multiple model configurations
Run HBV-96, GR4J, and other models
Compare model outputs and performance
Analyze model agreement and uncertainty
Visualize comparative results
Prerequisites
Multiple catchments for robust comparison
Quality-controlled forcing data
Optional: Observed discharge data for validation
# Example data structure
data/
├── catchments/
│ ├── basin_001.shp
│ ├── basin_002.shp
│ └── basin_003.shp
├── climate/
│ ├── era5_precip_2010_2020.nc
│ └── era5_temp_2010_2020.nc
└── observations/
└── discharge_observations.csv
Step 1: Setup and Data Loading
import geopandas as gpd
import xarray as xr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from marrmotflow import MARRMOTWorkflow
from marrmotflow._default_dicts import default_model_dict
# Set plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
Load Multiple Catchments
# Load multiple catchment files
import glob
catchment_files = glob.glob("data/catchments/*.shp")
catchments = []
for file in catchment_files:
gdf = gpd.read_file(file)
catchments.append(gdf)
# Combine into single GeoDataFrame
all_catchments = gpd.GeoDataFrame(pd.concat(catchments, ignore_index=True))
print(f"Loaded {len(all_catchments)} catchments")
print(f"Catchment IDs: {all_catchments['id'].tolist() if 'id' in all_catchments.columns else 'No ID column'}")
Load Climate Data
# Load climate forcing data
precip_data = xr.open_dataset("data/climate/era5_precip_2010_2020.nc")
temp_data = xr.open_dataset("data/climate/era5_temp_2010_2020.nc")
# Combine datasets
climate_data = xr.merge([precip_data, temp_data])
print("Climate data loaded:")
print(f"Variables: {list(climate_data.data_vars)}")
print(f"Time range: {climate_data.time.min().item()} to {climate_data.time.max().item()}")
Step 2: Define Model Comparison Setup
Model Selection
# Select models for comparison
models_to_compare = {
"HBV-96": 7,
"GR4J": 37,
"Collie1": 1,
"Wetland": 2
}
print("Models selected for comparison:")
for name, number in models_to_compare.items():
model_info = default_model_dict.get(number, {"name": "Unknown", "description": "N/A"})
print(f" {name} (Model {number}): {model_info['description']}")
Common Configuration
# Common workflow configuration
common_config = {
"cat": all_catchments,
"forcing_files": [
"data/climate/era5_precip_2010_2020.nc",
"data/climate/era5_temp_2010_2020.nc"
],
"forcing_vars": {
"precip": "total_precipitation",
"temp": "2m_temperature"
},
"forcing_units": {
"precip": "m/day", # ERA5 units
"temp": "kelvin" # ERA5 units
},
"pet_method": "penman_monteith",
"forcing_time_zone": "UTC",
"model_time_zone": "America/Vancouver"
}
Step 3: Create Multiple Workflows
# Create workflow for each model
workflows = {}
for model_name, model_number in models_to_compare.items():
print(f"Creating workflow for {model_name}...")
workflows[model_name] = MARRMOTWorkflow(
name=f"Comparison_{model_name}",
model_number=model_number,
**common_config
)
print(f"Created {len(workflows)} workflows for model comparison")
Step 4: Expected Results Analysis
Note
This section shows how to analyze results once workflows are executed. The actual execution method depends on the current MarrmotFlow implementation.
Result Structure
# Expected structure for comparison results
def create_mock_results():
"""Create mock results for demonstration purposes."""
# Generate sample time series
dates = pd.date_range('2010-01-01', '2020-12-31', freq='D')
n_days = len(dates)
results = {}
for model_name, model_number in models_to_compare.items():
# Generate realistic discharge patterns
base_discharge = 2.0 + 0.5 * np.sin(2 * np.pi * np.arange(n_days) / 365.25)
noise = np.random.normal(0, 0.3, n_days)
model_bias = {"HBV-96": 0.1, "GR4J": -0.05, "Collie1": 0.0, "Wetland": 0.15}.get(model_name, 0)
discharge = np.maximum(0, base_discharge + noise + model_bias)
results[model_name] = {
'discharge': xr.DataArray(
discharge,
dims=['time'],
coords={'time': dates},
attrs={'units': 'mm/day', 'model': model_name}
)
}
return results
# For demonstration purposes
results = create_mock_results()
Step 5: Model Comparison Analysis
Time Series Comparison
def plot_discharge_comparison(results, start_date='2015-01-01', end_date='2016-12-31'):
"""Compare discharge time series from multiple models."""
fig, axes = plt.subplots(2, 1, figsize=(15, 10))
# Time series plot
for model_name, model_results in results.items():
discharge = model_results['discharge'].sel(time=slice(start_date, end_date))
discharge.plot(ax=axes[0], label=model_name, alpha=0.8)
axes[0].set_title('Model Discharge Comparison (2015-2016)')
axes[0].set_ylabel('Discharge (mm/day)')
axes[0].legend()
axes[0].grid(True, alpha=0.3)
# Box plot for seasonal comparison
seasonal_data = []
model_names = []
seasons = []
for model_name, model_results in results.items():
discharge = model_results['discharge'].sel(time=slice(start_date, end_date))
df = discharge.to_dataframe(name='discharge')
df['season'] = df.index.month.map({
12: 'Winter', 1: 'Winter', 2: 'Winter',
3: 'Spring', 4: 'Spring', 5: 'Spring',
6: 'Summer', 7: 'Summer', 8: 'Summer',
9: 'Fall', 10: 'Fall', 11: 'Fall'
})
for season in ['Winter', 'Spring', 'Summer', 'Fall']:
season_data = df[df['season'] == season]['discharge']
seasonal_data.extend(season_data.values)
model_names.extend([model_name] * len(season_data))
seasons.extend([season] * len(season_data))
comparison_df = pd.DataFrame({
'discharge': seasonal_data,
'model': model_names,
'season': seasons
})
sns.boxplot(data=comparison_df, x='season', y='discharge', hue='model', ax=axes[1])
axes[1].set_title('Seasonal Discharge Distribution by Model')
axes[1].set_ylabel('Discharge (mm/day)')
plt.tight_layout()
plt.show()
# Plot comparison
plot_discharge_comparison(results)
Statistical Comparison
def calculate_model_statistics(results):
"""Calculate comparative statistics for all models."""
stats_dict = {}
for model_name, model_results in results.items():
discharge = model_results['discharge']
stats = {
'mean': discharge.mean().item(),
'std': discharge.std().item(),
'min': discharge.min().item(),
'max': discharge.max().item(),
'q25': discharge.quantile(0.25).item(),
'q50': discharge.quantile(0.50).item(),
'q75': discharge.quantile(0.75).item(),
'cv': (discharge.std() / discharge.mean()).item()
}
stats_dict[model_name] = stats
# Convert to DataFrame for easy comparison
stats_df = pd.DataFrame(stats_dict).T
return stats_df
# Calculate and display statistics
model_stats = calculate_model_statistics(results)
print("Model Comparison Statistics:")
print(model_stats.round(3))
# Visualize statistics
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
# Mean discharge
model_stats['mean'].plot(kind='bar', ax=axes[0,0], color='skyblue')
axes[0,0].set_title('Mean Annual Discharge')
axes[0,0].set_ylabel('Discharge (mm/day)')
axes[0,0].tick_params(axis='x', rotation=45)
# Standard deviation
model_stats['std'].plot(kind='bar', ax=axes[0,1], color='lightcoral')
axes[0,1].set_title('Discharge Variability (Std Dev)')
axes[0,1].set_ylabel('Standard Deviation (mm/day)')
axes[0,1].tick_params(axis='x', rotation=45)
# Coefficient of variation
model_stats['cv'].plot(kind='bar', ax=axes[1,0], color='lightgreen')
axes[1,0].set_title('Coefficient of Variation')
axes[1,0].set_ylabel('CV (-)')
axes[1,0].tick_params(axis='x', rotation=45)
# Range (max - min)
discharge_range = model_stats['max'] - model_stats['min']
discharge_range.plot(kind='bar', ax=axes[1,1], color='gold')
axes[1,1].set_title('Discharge Range')
axes[1,1].set_ylabel('Range (mm/day)')
axes[1,1].tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()
Model Agreement Analysis
def analyze_model_agreement(results):
"""Analyze agreement between different models."""
# Create combined dataset
discharge_data = {}
for model_name, model_results in results.items():
discharge_data[model_name] = model_results['discharge'].values
combined_df = pd.DataFrame(discharge_data)
# Calculate correlation matrix
correlation_matrix = combined_df.corr()
# Plot correlation heatmap
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
# Correlation heatmap
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
ax=axes[0], square=True, cbar_kws={'label': 'Correlation'})
axes[0].set_title('Model Discharge Correlations')
# Model agreement (ensemble statistics)
ensemble_mean = combined_df.mean(axis=1)
ensemble_std = combined_df.std(axis=1)
# Plot ensemble statistics
time_index = pd.date_range('2010-01-01', periods=len(ensemble_mean), freq='D')
axes[1].fill_between(time_index,
ensemble_mean - ensemble_std,
ensemble_mean + ensemble_std,
alpha=0.3, label='±1 std')
axes[1].plot(time_index, ensemble_mean, 'k-', linewidth=2, label='Ensemble Mean')
# Plot individual models (subset for clarity)
subset_indices = slice(0, 365) # First year only
for model_name in ['HBV-96', 'GR4J']:
axes[1].plot(time_index[subset_indices],
combined_df[model_name].iloc[subset_indices],
alpha=0.7, label=model_name)
axes[1].set_title('Model Ensemble Analysis (First Year)')
axes[1].set_ylabel('Discharge (mm/day)')
axes[1].legend()
axes[1].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
return correlation_matrix, ensemble_mean, ensemble_std
# Analyze model agreement
corr_matrix, ens_mean, ens_std = analyze_model_agreement(results)
print("Model Correlation Summary:")
print(f"Average inter-model correlation: {corr_matrix.values[np.triu_indices_from(corr_matrix.values, k=1)].mean():.3f}")
print(f"Highest correlation: {corr_matrix.values.max():.3f}")
print(f"Lowest correlation: {corr_matrix.values[corr_matrix.values < 1].min():.3f}")
Step 6: Performance Assessment
def load_observed_data():
"""Load observed discharge data for validation."""
# Mock observed data for demonstration
dates = pd.date_range('2010-01-01', '2020-12-31', freq='D')
# Generate realistic observed discharge
base_obs = 2.0 + 0.5 * np.sin(2 * np.pi * np.arange(len(dates)) / 365.25)
noise = np.random.normal(0, 0.2, len(dates))
observed = np.maximum(0, base_obs + noise)
return pd.Series(observed, index=dates, name='observed_discharge')
def calculate_performance_metrics(observed, simulated):
"""Calculate hydrological performance metrics."""
# Ensure same length and no missing data
common_idx = observed.index.intersection(simulated.index)
obs = observed.loc[common_idx].dropna()
sim = simulated.loc[common_idx].dropna()
# Nash-Sutcliffe Efficiency
nse = 1 - np.sum((obs - sim)**2) / np.sum((obs - obs.mean())**2)
# Root Mean Square Error
rmse = np.sqrt(np.mean((obs - sim)**2))
# Percent Bias
pbias = 100 * np.sum(sim - obs) / np.sum(obs)
# Correlation coefficient
correlation = np.corrcoef(obs, sim)[0, 1]
# Kling-Gupta Efficiency
r = correlation
alpha = sim.std() / obs.std()
beta = sim.mean() / obs.mean()
kge = 1 - np.sqrt((r - 1)**2 + (alpha - 1)**2 + (beta - 1)**2)
return {
'NSE': nse,
'RMSE': rmse,
'PBIAS': pbias,
'R': correlation,
'KGE': kge
}
# Load observed data and calculate performance
observed = load_observed_data()
performance_results = {}
for model_name, model_results in results.items():
simulated = pd.Series(
model_results['discharge'].values,
index=model_results['discharge'].time.values,
name=f'{model_name}_discharge'
)
performance_results[model_name] = calculate_performance_metrics(observed, simulated)
# Create performance comparison table
performance_df = pd.DataFrame(performance_results).T
print("Model Performance Comparison:")
print(performance_df.round(3))
# Visualize performance metrics
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()
metrics = ['NSE', 'RMSE', 'PBIAS', 'R', 'KGE']
colors = ['skyblue', 'lightcoral', 'lightgreen', 'gold', 'plum']
for i, metric in enumerate(metrics):
performance_df[metric].plot(kind='bar', ax=axes[i], color=colors[i])
axes[i].set_title(f'{metric}')
axes[i].tick_params(axis='x', rotation=45)
axes[i].grid(True, alpha=0.3)
# Add reference lines for some metrics
if metric == 'NSE':
axes[i].axhline(y=0.5, color='red', linestyle='--', alpha=0.7, label='Good (0.5)')
axes[i].legend()
elif metric == 'PBIAS':
axes[i].axhline(y=0, color='red', linestyle='--', alpha=0.7, label='Perfect (0)')
axes[i].legend()
# Remove extra subplot
axes[-1].remove()
plt.tight_layout()
plt.show()
Step 7: Uncertainty Quantification
def quantify_model_uncertainty(results):
"""Quantify uncertainty across models."""
# Combine all model results
all_discharges = []
for model_name, model_results in results.items():
all_discharges.append(model_results['discharge'].values)
all_discharges = np.array(all_discharges)
# Calculate ensemble statistics
ensemble_mean = np.mean(all_discharges, axis=0)
ensemble_std = np.std(all_discharges, axis=0)
ensemble_min = np.min(all_discharges, axis=0)
ensemble_max = np.max(all_discharges, axis=0)
# Calculate uncertainty metrics
relative_uncertainty = ensemble_std / ensemble_mean * 100
spread = ensemble_max - ensemble_min
# Time series for plotting
time_index = results[list(results.keys())[0]]['discharge'].time.values
# Plot uncertainty analysis
fig, axes = plt.subplots(3, 1, figsize=(15, 12))
# Ensemble with uncertainty bounds
axes[0].fill_between(time_index[0:365],
(ensemble_mean - ensemble_std)[0:365],
(ensemble_mean + ensemble_std)[0:365],
alpha=0.3, color='gray', label='±1 std')
axes[0].fill_between(time_index[0:365],
ensemble_min[0:365],
ensemble_max[0:365],
alpha=0.2, color='red', label='Min-Max range')
axes[0].plot(time_index[0:365], ensemble_mean[0:365], 'k-', linewidth=2, label='Ensemble Mean')
axes[0].set_title('Model Uncertainty (First Year)')
axes[0].set_ylabel('Discharge (mm/day)')
axes[0].legend()
axes[0].grid(True, alpha=0.3)
# Relative uncertainty
axes[1].plot(time_index[0:365], relative_uncertainty[0:365])
axes[1].set_title('Relative Uncertainty (%)')
axes[1].set_ylabel('Uncertainty (%)')
axes[1].grid(True, alpha=0.3)
# Monthly uncertainty
monthly_uncertainty = []
month_names = []
for month in range(1, 13):
month_mask = pd.to_datetime(time_index).month == month
monthly_uncertainty.append(relative_uncertainty[month_mask].mean())
month_names.append(pd.to_datetime(f'2020-{month:02d}-01').strftime('%b'))
axes[2].bar(month_names, monthly_uncertainty, color='steelblue', alpha=0.7)
axes[2].set_title('Average Monthly Model Uncertainty')
axes[2].set_ylabel('Relative Uncertainty (%)')
axes[2].tick_params(axis='x', rotation=45)
axes[2].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# Summary statistics
print("Uncertainty Summary:")
print(f"Mean relative uncertainty: {np.mean(relative_uncertainty):.1f}%")
print(f"Maximum relative uncertainty: {np.max(relative_uncertainty):.1f}%")
print(f"Average ensemble spread: {np.mean(spread):.2f} mm/day")
return {
'ensemble_mean': ensemble_mean,
'ensemble_std': ensemble_std,
'relative_uncertainty': relative_uncertainty,
'spread': spread
}
# Quantify uncertainty
uncertainty_results = quantify_model_uncertainty(results)
Step 8: Model Ranking and Selection
def rank_models(performance_df):
"""Rank models based on multiple performance criteria."""
# Define weights for different metrics (adjust based on study objectives)
weights = {
'NSE': 0.3,
'KGE': 0.3,
'R': 0.2,
'RMSE': -0.1, # Negative because lower is better
'PBIAS': -0.1 # Negative because closer to 0 is better (absolute value)
}
# Normalize metrics to 0-1 scale
normalized_metrics = performance_df.copy()
for metric in ['NSE', 'KGE', 'R']:
# Higher is better - normalize to 0-1
normalized_metrics[metric] = (performance_df[metric] - performance_df[metric].min()) / (performance_df[metric].max() - performance_df[metric].min())
for metric in ['RMSE']:
# Lower is better - invert and normalize
normalized_metrics[metric] = 1 - (performance_df[metric] - performance_df[metric].min()) / (performance_df[metric].max() - performance_df[metric].min())
for metric in ['PBIAS']:
# Closer to 0 is better
normalized_metrics[metric] = 1 - np.abs(performance_df[metric]) / np.abs(performance_df[metric]).max()
# Calculate weighted score
weighted_scores = {}
for model in performance_df.index:
score = sum(weights[metric] * normalized_metrics.loc[model, metric] for metric in weights.keys())
weighted_scores[model] = score
# Rank models
ranked_models = sorted(weighted_scores.items(), key=lambda x: x[1], reverse=True)
print("Model Ranking (based on weighted performance):")
for i, (model, score) in enumerate(ranked_models, 1):
print(f"{i}. {model}: {score:.3f}")
return ranked_models, weighted_scores
# Rank models
model_ranking, scores = rank_models(performance_df)
# Visualize ranking
models = [item[0] for item in model_ranking]
scores_list = [item[1] for item in model_ranking]
plt.figure(figsize=(10, 6))
bars = plt.bar(models, scores_list, color=['gold', 'silver', '#CD7F32', 'lightgray'])
plt.title('Model Performance Ranking')
plt.ylabel('Weighted Performance Score')
plt.xticks(rotation=45)
plt.grid(True, alpha=0.3)
# Add score labels on bars
for bar, score in zip(bars, scores_list):
plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
f'{score:.3f}', ha='center', va='bottom')
plt.tight_layout()
plt.show()
Complete Multi-Model Workflow Script
#!/usr/bin/env python3
"""
Complete multi-model comparison workflow for MarrmotFlow
"""
import geopandas as gpd
import xarray as xr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from marrmotflow import MARRMOTWorkflow
def main():
# Configuration
models_to_compare = {"HBV-96": 7, "GR4J": 37, "Collie1": 1}
# Load data
print("Loading data...")
catchments = gpd.read_file("data/catchments.shp")
# Create workflows
workflows = {}
for model_name, model_number in models_to_compare.items():
print(f"Creating workflow for {model_name}...")
workflows[model_name] = MARRMOTWorkflow(
name=f"Comparison_{model_name}",
cat=catchments,
forcing_files="data/climate_data.nc",
forcing_vars={"precip": "precipitation", "temp": "temperature"},
model_number=model_number,
pet_method="penman_monteith"
)
print(f"Multi-model comparison setup complete!")
print(f"Ready to compare {len(workflows)} models across {len(catchments)} catchments")
# Execute workflows and analysis here...
if __name__ == "__main__":
main()
Next Steps
After completing this multi-model comparison:
Parameter sensitivity analysis: Examine how parameter uncertainty affects model agreement
Spatial analysis: Compare model performance across different catchment types
Climate change projections: Use ensemble results for climate impact assessment
Operational forecasting: Implement ensemble-based forecasting systems
Key Takeaways
Model diversity is valuable: Different models capture different aspects of hydrological processes
Ensemble approaches reduce uncertainty: Combining multiple models often provides more robust results
Performance varies by metric: Models may rank differently depending on evaluation criteria
Context matters: Model performance can vary by season, climate, and catchment characteristics