Saving MHW Results to NetCDF

Robert W Schlegel

2021-10-26

Overview

In the previous vignette we saw how to detect marine heatwaves (MHWs) in gridded data. In this vignette we will use those gridded MHW results to see how to save them as a NetCDF file.

library(dplyr) # For basic data manipulation
library(ncdf4) # For creating NetCDF files
library(tidync) # For easily dealing with NetCDF data
library(ggplot2) # For visualising data
library(doParallel) # For parallel processing

Loading data

Please see the downloading and preparing OISST data and detecting marine heatwaves (MHWs) in gridded data vignettes first if you have not yet worked through them. We will be using the MHW results created from those two vignettes below.

MHW_res_grid <- readRDS("~/Desktop/MHW_result.Rds")

Prepare the data

The main sticking point between MHW results and the NetCDF file format in R is that NetCDF files will only accept the file type in R known as “arrays”, and most R outputs are data.frames. So the majority of what we need to do is to convert our data from data.frames to arrays. There are many ways to do this and a search of the interwebs will produce a plethora of results. Over the years I have settled into an approach that I use operationally for the MHW Tracker that I will also use here. It is not necessarily the fastest method, nor does it use the fewest lines of code possible, but it does follow the tidyverse approach to programming, and I think it is about as transparent as this process can be. We will create two functions below that will help us along in this process.

# Function for creating arrays from data.frames
df_acast <- function(df, lon_lat){
  
  # Force grid
  res <- df %>%
    right_join(lon_lat, by = c("lon", "lat")) %>%
    arrange(lon, lat)
  
  # Convert date values to integers if they are present
  if(lubridate::is.Date(res[1,4])) res[,4] <- as.integer(res[,4])
  
  # Create array
  res_array <- base::array(res[,4], dim = c(length(unique(lon_lat$lon)), length(unique(lon_lat$lat))))
  dimnames(res_array) <- list(lon = unique(lon_lat$lon),
                              lat = unique(lon_lat$lat))
  return(res_array)
}

# Wrapper function for last step before data are entered into NetCDF files
df_proc <- function(df, col_choice){
  
  # Determine the correct array dimensions
  lon_step <- mean(diff(sort(unique(df$lon))))
  lat_step <- mean(diff(sort(unique(df$lat))))
  lon <- seq(min(df$lon), max(df$lon), by = lon_step)
  lat <- seq(min(df$lat), max(df$lat), by = lat_step)
  
  # Create full lon/lat grid
  lon_lat <- expand.grid(lon = lon, lat = lat) %>% 
    data.frame()
  
  # Acast only the desired column
  dfa <- plyr::daply(df[c("lon", "lat", "event_no", col_choice)], 
                     c("event_no"), df_acast, .parallel = T, lon_lat = lon_lat)
  return(dfa)
}

# We must now run this function on each column of data we want to add to the NetCDF file
doParallel::registerDoParallel(cores = 7)
prep_dur <- df_proc(MHW_res_grid, "duration")
prep_max_int <- df_proc(MHW_res_grid, "intensity_max")
prep_cum_int <- df_proc(MHW_res_grid, "intensity_cumulative")
prep_peak <- df_proc(MHW_res_grid, "date_peak")

Create NetCDF shell

With our data prepared into a series of arrays, we now need to set the stage for our NetCDF file. The important thing here is that the dimensions of the arrays we created match up to the dimensions of the NetCDF file.

# Get file attributes
lon_step <- mean(diff(sort(unique(MHW_res_grid$lon))))
lat_step <- mean(diff(sort(unique(MHW_res_grid$lat))))
lon <- seq(min(MHW_res_grid$lon), max(MHW_res_grid$lon), by = lon_step)
lat <- seq(min(MHW_res_grid$lat), max(MHW_res_grid$lat), by = lat_step)
event_no <- seq(min(MHW_res_grid$event_no), max(MHW_res_grid$event_no), by = 1)
tunits <- "days since 1970-01-01"

# Length of each attribute
nlon <- length(lon)
nlat <- length(lat)
nen <- length(event_no)

Create NetCDF file

The last step in this process is to add the prepared data to the NetCDF shell we have constructed. The ncdf4 package helps to make this all a relatively straight forward process.

# Path and file name
  # NB: We are net setting time dimensions here because we are using event_no as our "time" dimension
ncpath <- "~/Desktop/"
ncname <- "MHW_results"
ncfname <- paste0(ncpath, ncname, ".nc")
# dname <- "tmp"

# Define dimensions
londim <- ncdim_def("lon", "degrees_east", lon)
latdim <- ncdim_def("lat", "degrees_north", lat)
endim <- ncdim_def("event_no", "event_number", event_no)
# timedim <- ncdim_def("time", tunits, as.double(time))

# Define variables
fillvalue <- 1e32
def_dur <- ncvar_def(name = "duration", units = "days", dim = list(endim, latdim, londim), 
                     missval = fillvalue, longname = "duration of MHW", prec = "double")
def_max_int <- ncvar_def(name = "max_int", units = "deg_c", dim = list(endim, latdim, londim), 
                         missval = fillvalue, longname = "maximum intensity during MHW", prec = "double")
def_cum_int <- ncvar_def(name = "cum_int", units = "deg_c days", dim = list(endim, latdim, londim), 
                         missval = fillvalue, longname = "cumulative intensity during MHW", prec = "double")
def_peak <- ncvar_def(name = "date_peak", units = tunits, dim = list(endim, latdim, londim),
                      missval = 0, longname = "date of peak temperature anomaly during MHW", prec = "integer")

# Create netCDF file and prepare space for our arrays
ncout <- nc_create(ncfname, list(def_peak, def_dur, def_max_int, def_cum_int), force_v4 = TRUE)

# Add the actual data
ncvar_put(ncout, def_dur, prep_dur)
ncvar_put(ncout, def_max_int, prep_max_int)
ncvar_put(ncout, def_cum_int, prep_cum_int)
ncvar_put(ncout, def_peak, prep_peak)

# Put additional attributes into dimension and data variables
ncatt_put(ncout, "lon", "axis", "X") #,verbose=FALSE) #,definemode=FALSE)
ncatt_put(ncout, "lat", "axis", "Y")
ncatt_put(ncout, "event_no", "axis", "event_no")

# Add global attributes
  # These are useful for whomever else may want to use your NetCDF file
ncatt_put(ncout, 0, "title", paste0("MHW results from lon: ",
                                    min(lon)," to ",max(lon),
                                    " and lat: ",min(lat)," to ",max(lat)))
ncatt_put(ncout, 0, "institution", "Your institution here!")
ncatt_put(ncout, 0, "source", "NOAA OISST v2.1")
ncatt_put(ncout,0, "references", "Banzon et al. (2020) J. Atmos. Oce. Tech. 37:341-349")
history <- paste0("Your name here!, ", date())
ncatt_put(ncout, 0, "history", history)
ncatt_put(ncout, 0, "Conventions", "Hobday et al. (2016)") # Assuming one has used the Hobday definition

# Get a summary of the created file:
ncout

Visualising the results

Let’s not stop there. To ensure that the NetCDF file was created correctly we want to load it back into our workspace and plot the results.

# Convenience function for comparing files
quick_grid <- function(df, var_choice){
  df %>% 
    filter(event_no == 13) %>% 
    ggplot(aes(x = lon, y = lat)) +
    geom_raster(aes_string(fill = var_choice)) +
    coord_cartesian(expand = F) +
    scale_fill_viridis_c() +
    theme(legend.position = "bottom")
}

# Thanks to the tidync package, loading the gridded data is very simple
MHW_res_nc <- tidync("~/Desktop/MHW_results.nc") %>% 
  hyper_tibble() %>% 
  mutate(date_peak = as.Date(date_peak, origin = "1970-01-01"))

# Plot the duration results
quick_grid(MHW_res_grid, "duration")
quick_grid(MHW_res_nc, "duration")

# Cumulative intensity
quick_grid(MHW_res_grid, "intensity_cumulative")
quick_grid(MHW_res_nc, "cum_int")