In [1]:
# Uncomment to install climdata in Google Colab or other environments
# !pip install climdata
ClimData Tutorial¶
This notebook demonstrates the ClimData class for climate data extraction, extreme-index computation, and workflow management.
It includes examples of point-based and box-based extraction, variable exploration, and error handling.
1️⃣ Imports¶
In [2]:
from climdata import ClimData
import pandas as pd
import xarray as xr
import logging
logging.basicConfig(
    level=logging.INFO,
    format="%(levelname)s | %(message)s",
    force=True,
)
2️⃣ Explore available datasets¶
In [3]:
extractor = ClimData()
datasets = extractor.get_datasets()
print(datasets)
['dwd', 'mswx', 'hyras', 'cmip', 'power', 'w5e5', 'cmip_w5e5', 'nexgddp']
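For a quick overview of what each dataset offers, you can combine get_datasets() and get_variables() in a loop. This is a minimal sketch that reuses only the calls shown in this notebook; some datasets may require credentials or extra configuration before their variable list can be read, so failures are caught and reported rather than raised.

# Sketch: print the variable list of every available dataset.
for name in extractor.get_datasets():
    try:
        print(f"{name}: {extractor.get_variables(name)}")
    except Exception as exc:
        # Some datasets may need credentials or extra config to list variables.
        print(f"{name}: could not list variables ({exc})")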
3️⃣ Explore variables for a dataset¶
In [6]:
variables = extractor.get_variables('w5e5')
print(variables)
['tas', 'tasmax', 'tasmin', 'pr', 'rsds', 'rlds', 'hurs', 'sfcWind', 'ps', 'huss']
In [9]:
# for CMIP
import climdata
extractor_CMIP = climdata.CMIP(extractor.cfg)
print("Available Experiments (experiment_id)")
print("="*60)
print(extractor_CMIP.get_experiment_ids())
print("="*60)
print("Available CMIP6 Models (source_id)")
print("="*60)
print(extractor_CMIP.get_source_ids('ssp245'))
print("="*60)
print("Variables")
print("="*60)
print(extractor_CMIP.get_variables(experiment_id='ssp245', source_id='ACCESS-CM2'))
print("="*60)
⚠️ Warning: Requested time range 1989-2020 extends beyond the typical Historical period (1850-2014). Data availability may be limited.
Available Experiments (experiment_id)
============================================================
['historical', 'ssp119', 'ssp126', 'ssp245', 'ssp370', 'ssp434', 'ssp460', 'ssp585']
============================================================
Available CMIP6 Models (source_id)
============================================================
INFO | 46 models found for experiment 'ssp245'
['ACCESS-CM2', 'ACCESS-ESM1-5', 'AWI-CM-1-1-MR', 'BCC-CSM2-MR', 'CAMS-CSM1-0', 'CAS-ESM2-0', 'CESM2', 'CESM2-WACCM', 'CIESM', 'CMCC-CM2-SR5', 'CMCC-ESM2', 'CNRM-CM6-1', 'CNRM-CM6-1-HR', 'CNRM-ESM2-1', 'CanESM5', 'CanESM5-CanOE', 'E3SM-1-1', 'EC-Earth3', 'EC-Earth3-CC', 'EC-Earth3-Veg', 'EC-Earth3-Veg-LR', 'FGOALS-f3-L', 'FGOALS-g3', 'FIO-ESM-2-0', 'GFDL-CM4', 'GFDL-ESM4', 'GISS-E2-1-G', 'GISS-E2-1-H', 'HadGEM3-GC31-LL', 'IITM-ESM', 'INM-CM4-8', 'INM-CM5-0', 'IPSL-CM6A-LR', 'KACE-1-0-G', 'KIOST-ESM', 'MCM-UA-1-0', 'MIROC-ES2L', 'MIROC6', 'MPI-ESM1-2-HR', 'MPI-ESM1-2-LR', 'MRI-ESM2-0', 'NESM3', 'NorESM2-LM', 'NorESM2-MM', 'TaiESM1', 'UKESM1-0-LL']
============================================================
Variables
============================================================
['hurs', 'pr', 'sfcWind', 'tas', 'tasmax', 'tasmin']
============================================================
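If you need a model that provides a specific set of variables, you can filter the catalogue programmatically. A minimal sketch, assuming get_variables() can be queried per model exactly as in the cell above; note this issues one catalogue query per model, so it may take a while.

# Sketch: keep only the ssp245 models that provide tasmin, tasmax, and pr.
needed = {"tasmin", "tasmax", "pr"}
usable = []
for model in extractor_CMIP.get_source_ids("ssp245"):
    try:
        if needed.issubset(extractor_CMIP.get_variables(experiment_id="ssp245", source_id=model)):
            usable.append(model)
    except Exception:
        pass  # skip models whose catalogue entries cannot be read
print(usable)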
4️⃣ Explore metadata for a variable¶
In [4]:
variables = extractor.get_variables('w5e5')
print(variables)
print("*"*70)
varinfo = extractor.get_varinfo('rlds')
print(varinfo)
['tas', 'tasmax', 'tasmin', 'pr', 'rsds', 'rlds', 'hurs', 'sfcWind', 'ps', 'huss']
**********************************************************************
{'cf_name': 'surface_downwelling_longwave_flux_in_air', 'long_name': 'Surface downwelling longwave radiation', 'units': 'W m-2'}
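To build a quick glossary of a dataset's variables, loop get_varinfo() over the full variable list. A minimal sketch, assuming every variable carries the same metadata keys ('units', 'long_name') as rlds above.

# Sketch: print units and long names for all w5e5 variables.
for var in extractor.get_variables("w5e5"):
    info = extractor.get_varinfo(var)
    print(f"{var:10s} {info['units']:10s} {info['long_name']}")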
5️⃣ Explore available workflow actions¶
In [13]:
actions = extractor.get_actions()
print(actions.keys())
dict_keys(['extract', 'calc_index', 'impute', 'to_nc', 'to_csv', 'upload_netcdf', 'upload_csv'])
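Because get_actions() returns a mapping, you can validate a planned workflow sequence before running it. A minimal sketch:

# Sketch: check a planned action sequence against the available actions.
planned = ["extract", "calc_index", "to_csv"]
unknown = [a for a in planned if a not in extractor.get_actions()]
print("Unknown actions:", unknown or "none")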
In [6]:
indices = extractor.get_indices(['tasmin', 'tasmax'])
print(indices.keys())
impute_methods = extractor.get_impute_methods()
print(impute_methods.keys())
dict_keys(['heat_wave_index', 'heat_wave_frequency', 'heat_wave_max_length', 'heat_wave_total_length', 'hot_spell_frequency', 'hot_spell_max_length', 'hot_spell_total_length', 'hot_spell_max_magnitude', 'ice_days', 'isothermality', 'maximum_consecutive_frost_days', 'maximum_consecutive_frost_free_days', 'maximum_consecutive_tx_days'])
dict_keys(['BRITS', 'XGBOOST', 'CDRec', 'SoftImpute'])
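The index catalogue depends on which variables you pass in. A minimal sketch comparing two variable sets, assuming get_indices() also accepts precipitation (pr) in the input list:

# Sketch: which extra indices become available when pr is added?
temp_only = set(extractor.get_indices(["tasmin", "tasmax"]))
with_pr = set(extractor.get_indices(["tasmin", "tasmax", "pr"]))
print("Additional indices unlocked by pr:", sorted(with_pr - temp_only))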
6️⃣ Point extraction workflow¶
In [ ]:
import json
# -----------------------------
# Step 1: Define the area of interest (AOI)
# -----------------------------
# The AOI is a single point. In GeoJSON format, the coordinates are [longitude, latitude].
geojson = {
    "type": "FeatureCollection",
    "features": [
        {
            "type": "Feature",
            "properties": {},
            "geometry": {
                "coordinates": [
                    24.246667038198012,  # longitude
                    12.891982026993958   # latitude
                ],
                "type": "Point"
            }
        }
    ]
}
# -----------------------------
# Step 2: Define configuration overrides
# -----------------------------
# Overrides are strings used by Hydra to modify default configurations at runtime.
overrides = [
    "dataset=cmip",                          # Select the CMIP6 dataset for extraction
    f"aoi='{json.dumps(geojson)}'",          # Set the AOI to the point defined above
    "time_range.start_date=2004-01-01",      # Start date for data extraction
    "time_range.end_date=2014-12-31",        # End date for data extraction
    "variables=[tasmin,tasmax,pr]",          # Variables to extract: min/max temperature and precipitation
    "data_dir=./data",                       # Local directory for raw/intermediate files
    # "dsinfo.mswx.params.google_service_account=./.climdata_conf/service.json",  # optional; required only for MSWX downloads
    "index=tn10p",                           # Climate extreme index to calculate
    "impute=BRITS",                          # Imputation method for missing values
]
# -----------------------------
# Step 3: Define the workflow sequence
# -----------------------------
seq = ["extract", "impute", "calc_index", "to_nc"]
# -----------------------------
# Step 4: Initialize the ClimData extractor
# -----------------------------
extractor = ClimData(overrides=overrides)
# -----------------------------
# Step 5: Run the Multi-Step workflow
# -----------------------------
result = extractor.run_workflow(
    actions=seq,
)
INFO | Starting action: extract
/home/muduchuru/miniforge3/envs/sdba/lib/python3.10/site-packages/intake_esm/core.py:475: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
cat = self.__class__({'esmcat': self.esmcat.dict(), 'df': esmcat_results})
INFO | Completed action: extract
INFO | Starting action: impute
INFO | No missing data found. Imputation not required.
INFO | Completed action: impute
INFO | Starting action: calc_index
/beegfs/muduchuru/pkgs_fnl/climdata/climdata/utils/wrapper_workflow.py:632: UserWarning: Index tn10p usually requires ≥30 years, got 11
warnings.warn(f"Index {cfg.index} usually requires ≥30 years, got {n_years}", UserWarning)
INFO | Completed action: calc_index
INFO | Starting action: to_nc
<frozen importlib._bootstrap>:241: RuntimeWarning: numpy.ndarray size changed, may indicate binary incompatibility. Expected 16 from C header, got 96 from PyObject
INFO | Dataset saved to NetCDF file: cmip_tn10p_LAT12.891982026993958_LON24.246667038198012_2004-01-01_2014-12-31.nc
INFO | Completed action: to_nc
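The to_nc step logs the name of the file it writes. A minimal sketch for inspecting the saved file, assuming current_filename (shown in the Output filenames section below) points at the NetCDF file just written:

# Sketch: reopen the NetCDF file produced by the workflow above.
import xarray as xr
ds_saved = xr.open_dataset(extractor.current_filename)
print(ds_saved)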
In [8]:
import json
# -----------------------------
# Define the area of interest (AOI)
# -----------------------------
# This AOI is a single point with latitude 12.891982026993958 and longitude 24.246667038198012
geojson = {
    "type": "FeatureCollection",
    "features": [
        {
            "type": "Feature",
            "properties": {},
            "geometry": {
                "coordinates": [24.246667038198012, 12.891982026993958],
                "type": "Point"
            }
        }
    ]
}
# -----------------------------
# Define configuration overrides
# -----------------------------
# These strings override the default hydra config at runtime
overrides = [
    "dataset=mswx",                      # Select the MSWX dataset for extraction
    f"aoi='{json.dumps(geojson)}'",      # Set the AOI to the point defined above
    "time_range.start_date=2014-12-01",  # Start date of extraction
    "time_range.end_date=2014-12-31",    # End date of extraction
    "variables=[tasmin,tasmax,pr]",      # Variables to extract: min/max temperature & precipitation
    "data_dir=./data",                   # Local directory for downloaded/intermediate files
    # Optional Google service account if needed for MSWX access:
    # "dsinfo.mswx.params.google_service_account=./.climdata_conf/service.json",
    "index=tn10p",                       # Extreme climate index to calculate
]
# -----------------------------
# Initialize the ClimData extractor
# -----------------------------
# This loads the configuration with overrides and prepares the object
extractor = ClimData(overrides=overrides)
# -----------------------------
# Extract climate data
# -----------------------------
# Returns an xarray.Dataset for the selected variables, AOI, and time range
ds = extractor.extract()
# -----------------------------
# Compute the climate index
# -----------------------------
# Takes the extracted dataset and calculates the extreme index "tn10p"
# Returns a new xarray.Dataset containing only the index
ds_index = extractor.calc_index(ds)
# -----------------------------
# Convert the index dataset to a long-form pandas DataFrame
# -----------------------------
# Each row corresponds to a time, lat, lon, and variable (here just "tn10p")
df_index = extractor.to_dataframe(ds_index)
# -----------------------------
# Save the DataFrame to CSV
# -----------------------------
# This will write the index values to "index.csv" in the current working directory
extractor.to_csv(df_index, filename="index.csv")
✅ All 31 tasmin files already exist locally.
✅ All 31 tasmax files already exist locally.
✅ All 31 pr files already exist locally.
/beegfs/muduchuru/pkgs_fnl/climdata/climdata/utils/wrapper_workflow.py:632: UserWarning: Index tn10p usually requires ≥30 years, got 1
warnings.warn(f"Index {cfg.index} usually requires ≥30 years, got {n_years}", UserWarning)
INFO | DataFrame saved to CSV file: index.csv
Out[8]:
'index.csv'
Output filenames¶
In [9]:
print(extractor.current_filename)
# print(extractor.filename_nc)  # NetCDF filename, if a NetCDF file was written
index.csv
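A minimal sketch to round-trip the CSV written above back into pandas:

# Sketch: reload the CSV produced by to_csv and peek at the first rows.
import pandas as pd
df_saved = pd.read_csv(extractor.current_filename)
print(df_saved.head())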
7️⃣ Box extraction workflow¶
In [10]:
box_overrides = [
    "dataset=mswx",                      # Select the MSWX dataset for extraction
    "region=europe",                     # Select a predefined region (bounding box)
    "variables=[tasmin,tasmax]",
    "time_range.start_date=2014-12-01",  # Start date of extraction
    "time_range.end_date=2014-12-31",    # End date of extraction
    "data_dir=./data",                   # Local directory to store downloaded/intermediate files
]
extractor_box = ClimData(overrides=box_overrides)
result_box = extractor_box.run_workflow(actions=["extract", "to_csv"])
INFO | Starting action: extract
✅ All 31 tasmin files already exist locally.
✅ All 31 tasmax files already exist locally.
INFO | Completed action: extract
INFO | Starting action: to_csv
INFO | DataFrame saved to CSV file: mswx_tasmin_tasmax_LAT_34.0_71.0_LON_-25.0_45.0_2014-12-01_2014-12-31.csv
INFO | Completed action: to_csv
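A minimal sketch to summarise the box extraction per variable, assuming result_box exposes the extracted table as .dataframe the way result_idx does in the next section:

# Sketch: per-variable summary statistics for the box extraction.
df_box = result_box.dataframe
print(df_box.groupby("variable")["value"].describe())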
8️⃣ Compute an extreme index only¶
In [11]:
lat_berlin, lon_berlin = 52.5, 13.4
idx_overrides = [
    "dataset=mswx",                      # Select the MSWX dataset for extraction
    f"lat={lat_berlin}",                 # Latitude of the point (Berlin)
    f"lon={lon_berlin}",                 # Longitude of the point (Berlin)
    "variables=[tasmin,tasmax]",
    "time_range.start_date=2014-12-01",  # Start date of extraction
    "time_range.end_date=2014-12-31",    # End date of extraction
    "data_dir=./data",                   # Local directory to store downloaded/intermediate files
    "index=heat_wave_max_length",
]
extractor_idx = ClimData(overrides=idx_overrides)
result_idx = extractor_idx.run_workflow(actions=["extract", "calc_index", "to_csv"])
result_idx.dataframe.head()
INFO | Starting action: extract
✅ All 31 tasmin files already exist locally.
✅ All 31 tasmax files already exist locally.
INFO | Completed action: extract
INFO | Starting action: calc_index
/beegfs/muduchuru/pkgs_fnl/climdata/climdata/utils/wrapper_workflow.py:632: UserWarning: Index heat_wave_max_length usually requires ≥30 years, got 1
warnings.warn(f"Index {cfg.index} usually requires ≥30 years, got {n_years}", UserWarning)
INFO | Completed action: calc_index
INFO | Starting action: to_csv
INFO | DataFrame saved to CSV file: mswx_heat_wave_max_length_LAT_52.5_LON_13.4_2014-12-01_2014-12-31.csv
INFO | Completed action: to_csv
Out[11]:
|   | time | lat | lon | variable | value | units | source |
|---|---|---|---|---|---|---|---|
| 0 | 2014-01-01 | 52.549999 | 13.350003 | heat_wave_max_length | 0.0 | d | mswx |
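The same pattern scales to several points. A minimal sketch that reuses the lat/lon override style from the cell above for two hypothetical locations, assuming the workflow result carries .dataframe after to_csv as shown here:

# Sketch: run the same index workflow for several points.
cities = {"Berlin": (52.5, 13.4), "Munich": (48.1, 11.6)}  # hypothetical points
frames = {}
for name, (lat, lon) in cities.items():
    ex = ClimData(overrides=[
        "dataset=mswx",
        f"lat={lat}",
        f"lon={lon}",
        "variables=[tasmin,tasmax]",
        "time_range.start_date=2014-12-01",
        "time_range.end_date=2014-12-31",
        "data_dir=./data",
        "index=heat_wave_max_length",
    ])
    frames[name] = ex.run_workflow(actions=["extract", "calc_index", "to_csv"]).dataframe
print({name: len(df) for name, df in frames.items()})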
9️⃣ Error examples¶
In [12]:
try:
    bad_ex = ClimData()
    bad_ex.run_workflow(actions=["calc_index"])
except Exception as e:
    print("Error:", e)

try:
    bad_ex = ClimData()
    bad_ex.run_workflow(actions=["to_csv"])
except Exception as e:
    print("Error:", e)

try:
    bad_ex = ClimData()
    bad_ex.run_workflow(actions=["upload_netcdf"])
except Exception as e:
    print("Error:", e)
INFO | Starting action: calc_index
ERROR | Action 'calc_index' failed
Traceback (most recent call last):
File "/beegfs/muduchuru/pkgs_fnl/climdata/climdata/utils/wrapper_workflow.py", line 829, in run_workflow
raise ValueError(
ValueError: Action 'calc_index' requires a dataset, but no dataset is available. Upload or extract a dataset before computing an index.
Error: Action 'calc_index' requires a dataset, but no dataset is available. Upload or extract a dataset before computing an index.
INFO | Starting action: to_csv
ERROR | Action 'to_csv' failed
Traceback (most recent call last):
File "/beegfs/muduchuru/pkgs_fnl/climdata/climdata/utils/wrapper_workflow.py", line 838, in run_workflow
raise ValueError(
ValueError: Action 'to_dataframe' requires a dataset, but no dataset is available. Upload or extract a dataset before converting to a DataFrame.
Error: Action 'to_dataframe' requires a dataset, but no dataset is available. Upload or extract a dataset before converting to a DataFrame.
INFO | Starting action: upload_netcdf
ERROR | Action 'upload_netcdf' failed
Traceback (most recent call last):
File "/beegfs/muduchuru/pkgs_fnl/climdata/climdata/utils/wrapper_workflow.py", line 786, in run_workflow
raise ValueError(
ValueError: Action 'upload_netcdf' requires argument 'netcdf_file', but none was provided.
Error: Action 'upload_netcdf' requires argument 'netcdf_file', but none was provided.
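To fail fast with a clearer message, you can validate the requested actions before dispatching the workflow. A minimal sketch built only on get_actions() and run_workflow() as used throughout this notebook:

# Sketch: guard a workflow run by validating the requested actions first.
def safe_run(climdata_obj, actions):
    known = climdata_obj.get_actions()
    unknown = [a for a in actions if a not in known]
    if unknown:
        raise ValueError(f"Unknown actions: {unknown}; available: {list(known)}")
    return climdata_obj.run_workflow(actions=actions)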