In [1]:
# Uncomment to install climdata in Google Colab or other environments
# !pip install climdata
ClimData Tutorial¶
This notebook demonstrates the ClimData class for climate data extraction, extreme-index computation, and workflow management.
It includes examples of point-based and box-based extraction, variable exploration, and error handling.
1️⃣ Imports¶
In [2]:
from climdata import ClimData
import pandas as pd
import xarray as xr
import logging
logging.basicConfig(
    level=logging.INFO,
    format="%(levelname)s | %(message)s",
    force=True,
)
2️⃣ Explore available datasets¶
In [3]:
extractor = ClimData()
datasets = extractor.get_datasets()
print(datasets)
['dwd', 'mswx', 'hyras', 'cmip', 'power', 'w5e5', 'cmip_w5e5', 'nexgddp']
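For a quick overview of what each dataset offers, you can combine get_datasets() and get_variables() in a loop. This is a minimal sketch that reuses only the calls shown in this notebook; some datasets may require credentials or extra configuration before their variable list can be read, so failures are caught and reported rather than raised.

# Sketch: print the variable list of every available dataset.
for name in extractor.get_datasets():
    try:
        print(f"{name}: {extractor.get_variables(name)}")
    except Exception as exc:
        # Some datasets may need credentials or extra config to list variables.
        print(f"{name}: could not list variables ({exc})")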
3️⃣ Explore variables for a dataset¶
In [6]:
variables = extractor.get_variables('w5e5')
print(variables)
['tas', 'tasmax', 'tasmin', 'pr', 'rsds', 'rlds', 'hurs', 'sfcWind', 'ps', 'huss']
In [9]:
# for CMIP
import climdata
extractor_CMIP = climdata.CMIP(extractor.cfg)
print("Available Experiments (experiment_id)")
print("="*60)
print(extractor_CMIP.get_experiment_ids())
print("="*60)
print("Available CMIP6 Models (source_id)")
print("="*60)
print(extractor_CMIP.get_source_ids('ssp245'))
print("="*60)
print("Variables")
print("="*60)
print(extractor_CMIP.get_variables(experiment_id='ssp245', source_id='ACCESS-CM2'))
print("="*60)
⚠️ Warning: Requested time range 1989-2020 extends beyond the typical Historical period (1850-2014). Data availability may be limited.
Available Experiments (experiment_id)
============================================================
['historical', 'ssp119', 'ssp126', 'ssp245', 'ssp370', 'ssp434', 'ssp460', 'ssp585']
============================================================
Available CMIP6 Models (source_id)
============================================================
INFO | 46 models found for experiment 'ssp245'
['ACCESS-CM2', 'ACCESS-ESM1-5', 'AWI-CM-1-1-MR', 'BCC-CSM2-MR', 'CAMS-CSM1-0', 'CAS-ESM2-0', 'CESM2', 'CESM2-WACCM', 'CIESM', 'CMCC-CM2-SR5', 'CMCC-ESM2', 'CNRM-CM6-1', 'CNRM-CM6-1-HR', 'CNRM-ESM2-1', 'CanESM5', 'CanESM5-CanOE', 'E3SM-1-1', 'EC-Earth3', 'EC-Earth3-CC', 'EC-Earth3-Veg', 'EC-Earth3-Veg-LR', 'FGOALS-f3-L', 'FGOALS-g3', 'FIO-ESM-2-0', 'GFDL-CM4', 'GFDL-ESM4', 'GISS-E2-1-G', 'GISS-E2-1-H', 'HadGEM3-GC31-LL', 'IITM-ESM', 'INM-CM4-8', 'INM-CM5-0', 'IPSL-CM6A-LR', 'KACE-1-0-G', 'KIOST-ESM', 'MCM-UA-1-0', 'MIROC-ES2L', 'MIROC6', 'MPI-ESM1-2-HR', 'MPI-ESM1-2-LR', 'MRI-ESM2-0', 'NESM3', 'NorESM2-LM', 'NorESM2-MM', 'TaiESM1', 'UKESM1-0-LL']
============================================================
Variables
============================================================
['hurs', 'pr', 'sfcWind', 'tas', 'tasmax', 'tasmin']
============================================================
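If you need a model that provides a specific set of variables, you can filter the catalogue programmatically. A minimal sketch, assuming get_variables() can be queried per model exactly as in the cell above; note this issues one catalogue query per model, so it may take a while.

# Sketch: keep only the ssp245 models that provide tasmin, tasmax, and pr.
needed = {"tasmin", "tasmax", "pr"}
usable = []
for model in extractor_CMIP.get_source_ids("ssp245"):
    try:
        if needed.issubset(extractor_CMIP.get_variables(experiment_id="ssp245", source_id=model)):
            usable.append(model)
    except Exception:
        pass  # skip models whose catalogue entries cannot be read
print(usable)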
4️⃣ Explore metadata for a variable¶
In [4]:
variables = extractor.get_variables('w5e5')
print(variables)
print("*"*70)
varinfo = extractor.get_varinfo('rlds')
print(varinfo)
['tas', 'tasmax', 'tasmin', 'pr', 'rsds', 'rlds', 'hurs', 'sfcWind', 'ps', 'huss']
**********************************************************************
{'cf_name': 'surface_downwelling_longwave_flux_in_air', 'long_name': 'Surface downwelling longwave radiation', 'units': 'W m-2'}
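To build a quick glossary of a dataset's variables, loop get_varinfo() over the full variable list. A minimal sketch, assuming every variable carries the same metadata keys ('units', 'long_name') as rlds above.

# Sketch: print units and long names for all w5e5 variables.
for var in extractor.get_variables("w5e5"):
    info = extractor.get_varinfo(var)
    print(f"{var:10s} {info['units']:10s} {info['long_name']}")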
5️⃣ Explore available workflow actions¶
In [13]:
actions = extractor.get_actions()
print(actions.keys())
dict_keys(['extract', 'calc_index', 'impute', 'to_nc', 'to_csv', 'upload_netcdf', 'upload_csv'])
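Because get_actions() returns a mapping, you can validate a planned workflow sequence before running it. A minimal sketch:

# Sketch: check a planned action sequence against the available actions.
planned = ["extract", "calc_index", "to_csv"]
unknown = [a for a in planned if a not in extractor.get_actions()]
print("Unknown actions:", unknown or "none")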
In [6]:
indices = extractor.get_indices(['tasmin', 'tasmax'])
print(indices.keys())
impute_methods = extractor.get_impute_methods()
print(impute_methods.keys())
dict_keys(['heat_wave_index', 'heat_wave_frequency', 'heat_wave_max_length', 'heat_wave_total_length', 'hot_spell_frequency', 'hot_spell_max_length', 'hot_spell_total_length', 'hot_spell_max_magnitude', 'ice_days', 'isothermality', 'maximum_consecutive_frost_days', 'maximum_consecutive_frost_free_days', 'maximum_consecutive_tx_days'])
dict_keys(['BRITS', 'XGBOOST', 'CDRec', 'SoftImpute'])
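The index catalogue depends on which variables you pass in. A minimal sketch comparing two variable sets, assuming get_indices() also accepts precipitation (pr) in the input list:

# Sketch: which extra indices become available when pr is added?
temp_only = set(extractor.get_indices(["tasmin", "tasmax"]))
with_pr = set(extractor.get_indices(["tasmin", "tasmax", "pr"]))
print("Additional indices unlocked by pr:", sorted(with_pr - temp_only))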
6️⃣ Point extraction workflow¶
In [ ]:
import json
# -----------------------------
# Step 1: Define the area of interest (AOI)
# -----------------------------
# The AOI is a single point. In GeoJSON format, the coordinates are [longitude, latitude].
geojson = {
    "type": "FeatureCollection",
    "features": [
        {
            "type": "Feature",
            "properties": {},
            "geometry": {
                "coordinates": [
                    24.246667038198012,  # longitude
                    12.891982026993958   # latitude
                ],
                "type": "Point"
            }
        }
    ]
}
# -----------------------------
# Step 2: Define configuration overrides
# -----------------------------
# Overrides are strings used by Hydra to modify default configurations at runtime.
overrides = [
    "dataset=cmip",                          # Select the CMIP6 dataset for extraction
    f"aoi='{json.dumps(geojson)}'",          # Set the AOI to the point defined above
    "time_range.start_date=2004-01-01",      # Start date for data extraction
    "time_range.end_date=2014-12-31",        # End date for data extraction
    "variables=[tasmin,tasmax,pr]",          # Variables to extract: min/max temperature and precipitation
    "data_dir=./data",                       # Local directory for raw/intermediate files
    # "dsinfo.mswx.params.google_service_account=./.climdata_conf/service.json",  # optional; required only for MSWX downloads
    "index=tn10p",                           # Climate extreme index to calculate
    "impute=BRITS",                          # Imputation method for missing values
]
# -----------------------------
# Step 3: Define the workflow sequence
# -----------------------------
seq = ["extract", "impute", "calc_index", "to_nc"]
# -----------------------------
# Step 4: Initialize the ClimData extractor
# -----------------------------
extractor = ClimData(overrides=overrides)
# -----------------------------
# Step 5: Run the Multi-Step workflow
# -----------------------------
result = extractor.run_workflow(
    actions=seq,
)
INFO | Starting action: extract
/home/muduchuru/miniforge3/envs/sdba/lib/python3.10/site-packages/intake_esm/core.py:475: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
cat = self.__class__({'esmcat': self.esmcat.dict(), 'df': esmcat_results})
INFO | Completed action: extract
INFO | Starting action: impute
INFO | No missing data found. Imputation not required.
INFO | Completed action: impute
INFO | Starting action: calc_index
/beegfs/muduchuru/pkgs_fnl/climdata/climdata/utils/wrapper_workflow.py:632: UserWarning: Index tn10p usually requires ≥30 years, got 11
warnings.warn(f"Index {cfg.index} usually requires ≥30 years, got {n_years}", UserWarning)
INFO | Completed action: calc_index
INFO | Starting action: to_nc
<frozen importlib._bootstrap>:241: RuntimeWarning: numpy.ndarray size changed, may indicate binary incompatibility. Expected 16 from C header, got 96 from PyObject
INFO | Dataset saved to NetCDF file: cmip_tn10p_LAT12.891982026993958_LON24.246667038198012_2004-01-01_2014-12-31.nc
INFO | Completed action: to_nc
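The to_nc step logs the name of the file it writes. A minimal sketch for inspecting the saved file, assuming current_filename (shown in the Output filenames section below) points at the NetCDF file just written:

# Sketch: reopen the NetCDF file produced by the workflow above.
import xarray as xr
ds_saved = xr.open_dataset(extractor.current_filename)
print(ds_saved)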
In [8]:
import json
# -----------------------------
# Define the area of interest (AOI)
# -----------------------------
# This AOI is a single point with latitude 12.891982026993958 and longitude 24.246667038198012
geojson = {
    "type": "FeatureCollection",
    "features": [
        {
            "type": "Feature",
            "properties": {},
            "geometry": {
                "coordinates": [24.246667038198012, 12.891982026993958],
                "type": "Point"
            }
        }
    ]
}
# -----------------------------
# Define configuration overrides
# -----------------------------
# These strings override the default hydra config at runtime
overrides = [
    "dataset=mswx",                      # Select the MSWX dataset for extraction
    f"aoi='{json.dumps(geojson)}'",      # Set the AOI to the point defined above
    "time_range.start_date=2014-12-01",  # Start date of extraction
    "time_range.end_date=2014-12-31",    # End date of extraction
    "variables=[tasmin,tasmax,pr]",      # Variables to extract: min/max temperature & precipitation
    "data_dir=./data",                   # Local directory for downloaded/intermediate files
    # Optional Google service account if needed for MSWX access:
    # "dsinfo.mswx.params.google_service_account=./.climdata_conf/service.json",
    "index=tn10p",                       # Extreme climate index to calculate
]
# -----------------------------
# Initialize the ClimData extractor
# -----------------------------
# This loads the configuration with overrides and prepares the object
extractor = ClimData(overrides=overrides)
# -----------------------------
# Extract climate data
# -----------------------------
# Returns an xarray.Dataset for the selected variables, AOI, and time range
ds = extractor.extract()
# -----------------------------
# Compute the climate index
# -----------------------------
# Takes the extracted dataset and calculates the extreme index "tn10p"
# Returns a new xarray.Dataset containing only the index
ds_index = extractor.calc_index(ds)
# -----------------------------
# Convert the index dataset to a long-form pandas DataFrame
# -----------------------------
# Each row corresponds to a time, lat, lon, and variable (here just "tn10p")
df_index = extractor.to_dataframe(ds_index)
# -----------------------------
# Save the DataFrame to CSV
# -----------------------------
# This will write the index values to "index.csv" in the current working directory
extractor.to_csv(df_index, filename="index.csv")
✅ All 31 tasmin files already exist locally.
✅ All 31 tasmax files already exist locally.
✅ All 31 pr files already exist locally.
/beegfs/muduchuru/pkgs_fnl/climdata/climdata/utils/wrapper_workflow.py:632: UserWarning: Index tn10p usually requires ≥30 years, got 1
warnings.warn(f"Index {cfg.index} usually requires ≥30 years, got {n_years}", UserWarning)
INFO | DataFrame saved to CSV file: index.csv
Out[8]:
'index.csv'
Output filenames¶
In [9]:
print(extractor.current_filename)
# print(extractor.filename_nc)  # NetCDF filename, if a NetCDF file was written
index.csv
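A minimal sketch to round-trip the CSV written above back into pandas:

# Sketch: reload the CSV produced by to_csv and peek at the first rows.
import pandas as pd
df_saved = pd.read_csv(extractor.current_filename)
print(df_saved.head())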
7️⃣ Box extraction workflow¶
In [10]:
box_overrides = [
    "dataset=mswx",                      # Select the MSWX dataset for extraction
    "region=europe",                     # Select a predefined region (bounding box)
    "variables=[tasmin,tasmax]",
    "time_range.start_date=2014-12-01",  # Start date of extraction
    "time_range.end_date=2014-12-31",    # End date of extraction
    "data_dir=./data",                   # Local directory to store downloaded/intermediate files
]
extractor_box = ClimData(overrides=box_overrides)
result_box = extractor_box.run_workflow(actions=["extract", "to_csv"])
INFO | Starting action: extract
✅ All 31 tasmin files already exist locally.
✅ All 31 tasmax files already exist locally.
INFO | Completed action: extract
INFO | Starting action: to_csv
INFO | DataFrame saved to CSV file: mswx_tasmin_tasmax_LAT_34.0_71.0_LON_-25.0_45.0_2014-12-01_2014-12-31.csv
INFO | Completed action: to_csv
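A minimal sketch to summarise the box extraction per variable, assuming result_box exposes the extracted table as .dataframe the way result_idx does in the next section:

# Sketch: per-variable summary statistics for the box extraction.
df_box = result_box.dataframe
print(df_box.groupby("variable")["value"].describe())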
8️⃣ Compute an extreme index only¶
In [11]:
lat_berlin, lon_berlin = 52.5, 13.4
idx_overrides = [
    "dataset=mswx",                      # Select the MSWX dataset for extraction
    f"lat={lat_berlin}",                 # Latitude of the point (Berlin)
    f"lon={lon_berlin}",                 # Longitude of the point (Berlin)
    "variables=[tasmin,tasmax]",
    "time_range.start_date=2014-12-01",  # Start date of extraction
    "time_range.end_date=2014-12-31",    # End date of extraction
    "data_dir=./data",                   # Local directory to store downloaded/intermediate files
    "index=heat_wave_max_length",
]
extractor_idx = ClimData(overrides=idx_overrides)
result_idx = extractor_idx.run_workflow(actions=["extract", "calc_index", "to_csv"])
result_idx.dataframe.head()
INFO | Starting action: extract
✅ All 31 tasmin files already exist locally.
✅ All 31 tasmax files already exist locally.
INFO | Completed action: extract
INFO | Starting action: calc_index
/beegfs/muduchuru/pkgs_fnl/climdata/climdata/utils/wrapper_workflow.py:632: UserWarning: Index heat_wave_max_length usually requires ≥30 years, got 1
warnings.warn(f"Index {cfg.index} usually requires ≥30 years, got {n_years}", UserWarning)
INFO | Completed action: calc_index
INFO | Starting action: to_csv
INFO | DataFrame saved to CSV file: mswx_heat_wave_max_length_LAT_52.5_LON_13.4_2014-12-01_2014-12-31.csv
INFO | Completed action: to_csv
Out[11]:
|   | time | lat | lon | variable | value | units | source |
|---|---|---|---|---|---|---|---|
| 0 | 2014-01-01 | 52.549999 | 13.350003 | heat_wave_max_length | 0.0 | d | mswx |
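The same pattern scales to several points. A minimal sketch that reuses the lat/lon override style from the cell above for two hypothetical locations, assuming the workflow result carries .dataframe after to_csv as shown here:

# Sketch: run the same index workflow for several points.
cities = {"Berlin": (52.5, 13.4), "Munich": (48.1, 11.6)}  # hypothetical points
frames = {}
for name, (lat, lon) in cities.items():
    ex = ClimData(overrides=[
        "dataset=mswx",
        f"lat={lat}",
        f"lon={lon}",
        "variables=[tasmin,tasmax]",
        "time_range.start_date=2014-12-01",
        "time_range.end_date=2014-12-31",
        "data_dir=./data",
        "index=heat_wave_max_length",
    ])
    frames[name] = ex.run_workflow(actions=["extract", "calc_index", "to_csv"]).dataframe
print({name: len(df) for name, df in frames.items()})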
9️⃣ Error examples¶
In [12]:
try:
    bad_ex = ClimData()
    bad_ex.run_workflow(actions=["calc_index"])
except Exception as e:
    print("Error:", e)

try:
    bad_ex = ClimData()
    bad_ex.run_workflow(actions=["to_csv"])
except Exception as e:
    print("Error:", e)

try:
    bad_ex = ClimData()
    bad_ex.run_workflow(actions=["upload_netcdf"])
except Exception as e:
    print("Error:", e)
INFO | Starting action: calc_index
ERROR | Action 'calc_index' failed
Traceback (most recent call last):
File "/beegfs/muduchuru/pkgs_fnl/climdata/climdata/utils/wrapper_workflow.py", line 829, in run_workflow
raise ValueError(
ValueError: Action 'calc_index' requires a dataset, but no dataset is available. Upload or extract a dataset before computing an index.
Error: Action 'calc_index' requires a dataset, but no dataset is available. Upload or extract a dataset before computing an index.
INFO | Starting action: to_csv
ERROR | Action 'to_csv' failed
Traceback (most recent call last):
File "/beegfs/muduchuru/pkgs_fnl/climdata/climdata/utils/wrapper_workflow.py", line 838, in run_workflow
raise ValueError(
ValueError: Action 'to_dataframe' requires a dataset, but no dataset is available. Upload or extract a dataset before converting to a DataFrame.
Error: Action 'to_dataframe' requires a dataset, but no dataset is available. Upload or extract a dataset before converting to a DataFrame.
INFO | Starting action: upload_netcdf
ERROR | Action 'upload_netcdf' failed
Traceback (most recent call last):
File "/beegfs/muduchuru/pkgs_fnl/climdata/climdata/utils/wrapper_workflow.py", line 786, in run_workflow
raise ValueError(
ValueError: Action 'upload_netcdf' requires argument 'netcdf_file', but none was provided.
Error: Action 'upload_netcdf' requires argument 'netcdf_file', but none was provided.
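To fail fast with a clearer message, you can validate the requested actions before dispatching the workflow. A minimal sketch built only on get_actions() and run_workflow() as used throughout this notebook:

# Sketch: guard a workflow run by validating the requested actions first.
def safe_run(climdata_obj, actions):
    known = climdata_obj.get_actions()
    unknown = [a for a in actions if a not in known]
    if unknown:
        raise ValueError(f"Unknown actions: {unknown}; available: {list(known)}")
    return climdata_obj.run_workflow(actions=actions)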