Create a dataset for benchmarking#
Combine three datasets:
The processed OCR burn probability data
The original burn probability data from Riley et al. (to create a “non-burnable” mask)
Historical fire perimeters
Note, this script has not been optimized for performance. It requires at least 400GB of memory (I used m8g.48xlarge)
import gc
import os
import geopandas as gpd
import icechunk
import numpy as np
import rasterio.features
import xarray as xr
# import seaborn as sns # plotting
from ocr import catalog # for the riley data
Read in and pre-process the data#
OCR burn probability#
version = '1.1.0'
setup = 'production'
storage = icechunk.s3_storage(
bucket='carbonplan-ocr',
prefix=f'output/fire-risk/tensor/{setup}/v{version}/ocr.icechunk',
from_env=True,
)
repo = icechunk.Repository.open(storage)
session = repo.readonly_session('main')
ds = xr.open_zarr(session.store, consolidated=False, zarr_format=3)
Riley et al. burn probability#
# --- read in
riley_2011_30m_4326 = catalog.get_dataset('riley-et-al-2025-2011-30m-4326').to_xarray()[['BP']]
# there are slight mismatches with the coordinates in `ds`
# interpolate riley data to exactly match ds coordinates
riley_interp = riley_2011_30m_4326.interp(
latitude=ds.latitude, longitude=ds.longitude, method='nearest'
)
# assign coordinates
ds['riley_BP_2011'] = riley_interp.BP
# create burnable mask
ds['riley_burnable_mask'] = xr.where(ds['riley_BP_2011'] > 0, 1, 0)
Historical fire perimeters#
Filter the data#
The Inter Agency Fire Perimeter History dataset includes small fires (where reported fire size limits are set by each reporting agency) and prescribed burns.
We filter out small fires in an effort to omit prescribed and more manageable fires. Based on the National Interagency Fire Center data, mean prescribed burns are around 30-50 acres (total fires / total acres). To omit most of these burns, we filter out all fire perimeters with an area less than 75 acres. (Note that there does not appear to be an input marking prescribed burns in the data).
# --- read in historical fire perimeter data (note, it can take a couple mins to load)
fp_path = 's3://carbonplan-ocr/evaluation/'
fp_name = 'InterAgencyFirePerimeterHistory_All_Years_View_-104997095188071827.gpkg'
gdf = gpd.read_file(os.path.join(fp_path, fp_name))
# convert CRS
gdf = gdf.to_crs('EPSG:4326')
# --- filter data
min_acres = 75
gdf = gdf[gdf['GIS_ACRES'] > min_acres]
Add burn sum and mask and merge with ds#
# --- define transform for rasterization
# Assuming lon/lat are 1D arrays
transform = (
rasterio.transform.from_bounds( # assumes first row in raster corresponds to max latitude
ds.longitude.min(),
ds.latitude.max(),
ds.longitude.max(),
ds.latitude.min(),
len(ds.longitude),
len(ds.latitude),
)
)
# --- compute burn sum
# create (geometry, value) tuples
shapes = [(geom, 1) for geom in gdf.geometry]
# burn all at once
burn_sum = rasterio.features.rasterize(
[(geom, 1) for geom in gdf.geometry],
out_shape=(len(ds.latitude), len(ds.longitude)),
transform=transform,
all_touched=True,
dtype=np.uint16, # allows counts > 255
merge_alg=rasterio.features.MergeAlg.add, # sum overlapping polygons
)
# add to dataset
ds['burn_sum'] = (('latitude', 'longitude'), burn_sum)
# get burn mask
ds['burn_mask'] = xr.where(ds['burn_sum'] > 0, 1, 0)
Save result#
# --- delete everything we don't need
del burn_sum, shapes, transform, gdf, riley_interp, riley_2011_30m_4326
gc.collect()
0
# --- rechunk
for name, var in ds.data_vars.items():
ds[name] = var.chunk({'latitude': 6000, 'longitude': 4500})
s3_path = f's3://carbonplan-ocr/evaluation/{version}'
savename = 'benchmarking-input-dat.zarr'
# write to S3
ds.to_zarr(
os.path.join(s3_path, savename),
mode='w', # overwrite (use "a" to append)
compute=True,
storage_options={'anon': False}, # set to True if public bucket
)
/opt/coiled/env/lib/python3.13/site-packages/zarr/api/asynchronous.py:247: ZarrUserWarning: Consolidated metadata is currently not part in the Zarr format 3 specification. It may not be supported by other zarr implementations and may change in the future.
warnings.warn(
<xarray.backends.zarr.ZarrStore at 0xeead0c7ae7a0>
Add cell area data and save var#
# add variable for grid cell areas
R = 6371000 # Earth radius (m)
dlat = np.deg2rad(np.abs(ds.latitude.values[1] - ds.latitude.values[0]))
dlon = np.deg2rad(np.abs(ds.longitude.values[1] - ds.longitude.values[0]))
# area for each latitude band (broadcast over lon)
area = R**2 * dlat * dlon * np.cos(np.deg2rad(ds.latitude.values)) # 1D over lat
area = xr.DataArray(area, coords=[ds.latitude.values], dims=['latitude']).broadcast_like(ds)
# convert m2 to km2
area = area / 1e6
# add var
ds['cell_area'] = area
ds['cell_area'].attrs['units'] = 'km^2'
# write only this variable to the existing store
ds[['cell_area']].to_zarr(os.path.join(s3_path, savename), mode='a') # append mode!
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0xeeacdff99450>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0xeeace3823650>, 1878.974127614)])']
connector: <aiohttp.connector.TCPConnector object at 0xeeacc00aac10>
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0xeeacdff9ac10>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0xeeace3822210>, 1878.977021968)])']
connector: <aiohttp.connector.TCPConnector object at 0xeeacc00aa990>
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0xeeacc00aaad0>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0xeeada7c01a30>, 1878.978593585)])']
connector: <aiohttp.connector.TCPConnector object at 0xeeacc00aad50>
/opt/coiled/env/lib/python3.13/site-packages/zarr/api/asynchronous.py:247: ZarrUserWarning: Consolidated metadata is currently not part in the Zarr format 3 specification. It may not be supported by other zarr implementations and may change in the future.
warnings.warn(
<xarray.backends.zarr.ZarrStore at 0xeeac6636e5c0>
# ------------------------