In the previous vignette we saw how to detect marine heatwaves (MHWs) in gridded data. In this vignette we will take those gridded MHW results and save them as a NetCDF file.
library(dplyr) # For basic data manipulation
library(ncdf4) # For creating NetCDF files
library(tidync) # For easily dealing with NetCDF data
library(ggplot2) # For visualising data
library(doParallel) # For parallel processing
Please see the downloading and preparing OISST data and detecting marine heatwaves (MHWs) in gridded data vignettes first if you have not yet worked through them. We will be using the MHW results created from those two vignettes below.
MHW_res_grid <- readRDS("~/Desktop/MHW_result.Rds")
The main sticking point between MHW results and the NetCDF file format in R is that NetCDF files will only accept arrays, while most R outputs are data.frames. So the majority of what we need to do is to convert our data from data.frames to arrays. There are many ways to do this and a search of the interwebs will produce a plethora of results. Over the years I have settled into an approach that I use operationally for the MHW Tracker and that I will also use here. It is not necessarily the fastest method, nor does it use the fewest lines of code possible, but it does follow the tidyverse approach to programming, and I think it is about as transparent as this process can be. We will create two functions below that will help us along in this process.
# Function for creating arrays from data.frames
df_acast <- function(df, lon_lat){

  # Force grid
  res <- df %>%
    right_join(lon_lat, by = c("lon", "lat")) %>%
    arrange(lon, lat)

  # Convert date values to integers if they are present
  if(lubridate::is.Date(res[1,4])) res[,4] <- as.integer(res[,4])

  # Create array
  res_array <- base::array(res[,4], dim = c(length(unique(lon_lat$lon)), length(unique(lon_lat$lat))))
  dimnames(res_array) <- list(lon = unique(lon_lat$lon),
                              lat = unique(lon_lat$lat))
  return(res_array)
}
# Wrapper function for last step before data are entered into NetCDF files
df_proc <- function(df, col_choice){

  # Determine the correct array dimensions
  lon_step <- mean(diff(sort(unique(df$lon))))
  lat_step <- mean(diff(sort(unique(df$lat))))
  lon <- seq(min(df$lon), max(df$lon), by = lon_step)
  lat <- seq(min(df$lat), max(df$lat), by = lat_step)

  # Create full lon/lat grid
  lon_lat <- expand.grid(lon = lon, lat = lat) %>%
    data.frame()

  # Acast only the desired column
  dfa <- plyr::daply(df[c("lon", "lat", "event_no", col_choice)],
                     c("event_no"), df_acast, .parallel = T, lon_lat = lon_lat)
  return(dfa)
}
# We must now run this function on each column of data we want to add to the NetCDF file
doParallel::registerDoParallel(cores = 7)
prep_dur <- df_proc(MHW_res_grid, "duration")
prep_max_int <- df_proc(MHW_res_grid, "intensity_max")
prep_cum_int <- df_proc(MHW_res_grid, "intensity_cumulative")
prep_peak <- df_proc(MHW_res_grid, "date_peak")
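Before going any further it may be worth a quick look at one of these prepared objects. This peek is not part of the workflow itself, but it shows that plyr::daply() has stacked the per-event lon/lat arrays along a new first dimension, one slice per event.
# Quick peek (a sketch): the prepared object is a 3-D array with the
# event slices stacked along the first dimension
class(prep_dur)
dim(prep_dur)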
With our data prepared into a series of arrays, we now need to set the stage for our NetCDF file. The important thing here is that the dimensions of the arrays we created match up to the dimensions of the NetCDF file.
# Get file attributes
lon_step <- mean(diff(sort(unique(MHW_res_grid$lon))))
lat_step <- mean(diff(sort(unique(MHW_res_grid$lat))))
lon <- seq(min(MHW_res_grid$lon), max(MHW_res_grid$lon), by = lon_step)
lat <- seq(min(MHW_res_grid$lat), max(MHW_res_grid$lat), by = lat_step)
event_no <- seq(min(MHW_res_grid$event_no), max(MHW_res_grid$event_no), by = 1)
tunits <- "days since 1970-01-01"

# Length of each attribute
nlon <- length(lon)
nlat <- length(lat)
nen <- length(event_no)
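As a quick sanity check (not strictly part of the workflow), we can confirm that the total number of values in a prepared array accounts for the full lon/lat/event_no grid described above before anything is written to file.
# The product of the NetCDF dimension lengths should match the number of
# values in each prepared array
prod(dim(prep_dur)) == nlon * nlat * nen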
The last step in this process is to add the prepared data to the NetCDF shell we have constructed. The ncdf4 package helps to make this a relatively straightforward process.
# Path and file name
# NB: We are not setting time dimensions here because we are using event_no as our "time" dimension
ncpath <- "~/Desktop/"
ncname <- "MHW_results"
ncfname <- paste0(ncpath, ncname, ".nc")
# dname <- "tmp"
# Define dimensions
<- ncdim_def("lon", "degrees_east", lon)
londim <- ncdim_def("lat", "degrees_north", lat)
latdim <- ncdim_def("event_no", "event_number", event_no)
endim # timedim <- ncdim_def("time", tunits, as.double(time))
# Define variables
fillvalue <- 1e32
def_dur <- ncvar_def(name = "duration", units = "days", dim = list(endim, latdim, londim),
                     missval = fillvalue, longname = "duration of MHW", prec = "double")
def_max_int <- ncvar_def(name = "max_int", units = "deg_c", dim = list(endim, latdim, londim),
                         missval = fillvalue, longname = "maximum intensity during MHW", prec = "double")
def_cum_int <- ncvar_def(name = "cum_int", units = "deg_c days", dim = list(endim, latdim, londim),
                         missval = fillvalue, longname = "cumulative intensity during MHW", prec = "double")
def_peak <- ncvar_def(name = "date_peak", units = tunits, dim = list(endim, latdim, londim),
                      missval = 0, longname = "date of peak temperature anomaly during MHW", prec = "integer")
# Create netCDF file and prepare space for our arrays
ncout <- nc_create(ncfname, list(def_peak, def_dur, def_max_int, def_cum_int), force_v4 = TRUE)
# Add the actual data
ncvar_put(ncout, def_dur, prep_dur)
ncvar_put(ncout, def_max_int, prep_max_int)
ncvar_put(ncout, def_cum_int, prep_cum_int)
ncvar_put(ncout, def_peak, prep_peak)
# Put additional attributes into dimension and data variables
ncatt_put(ncout, "lon", "axis", "X") #,verbose=FALSE) #,definemode=FALSE)
ncatt_put(ncout, "lat", "axis", "Y")
ncatt_put(ncout, "event_no", "axis", "event_no")
# Add global attributes
# These are useful for whoever else may want to use your NetCDF file
ncatt_put(ncout, 0, "title", paste0("MHW results from lon: ",
min(lon)," to ",max(lon),
" and lat: ",min(lat)," to ",max(lat)))
ncatt_put(ncout, 0, "institution", "Your institution here!")
ncatt_put(ncout, 0, "source", "NOAA OISST v2.1")
ncatt_put(ncout,0, "references", "Banzon et al. (2020) J. Atmos. Oce. Tech. 37:341-349")
<- paste0("Your name here!, ", date())
history ncatt_put(ncout, 0, "history", history)
ncatt_put(ncout, 0, "Conventions", "Hobday et al. (2016)") # Assuming one has used the Hobday definition
# Get a summary of the created file:
ncout
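One small step not shown above: once we are happy with the summary it is good practice to close the connection with nc_close(), which ensures everything is flushed to disk before we read the file back in below.
# Close the file connection
nc_close(ncout)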
Let’s not stop there. To ensure that the NetCDF file was created correctly we want to load it back into our workspace and plot the results.
# Convenience function for comparing files
quick_grid <- function(df, var_choice){
  df %>%
    filter(event_no == 13) %>%
    ggplot(aes(x = lon, y = lat)) +
    geom_raster(aes_string(fill = var_choice)) +
    coord_cartesian(expand = F) +
    scale_fill_viridis_c() +
    theme(legend.position = "bottom")
}
# Thanks to the tidync package, loading the gridded data is very simple
MHW_res_nc <- tidync("~/Desktop/MHW_results.nc") %>%
  hyper_tibble() %>%
  mutate(date_peak = as.Date(date_peak, origin = "1970-01-01"))
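Before plotting it may be worth a quick glance at the reloaded data to confirm that the variable names we chose with ncvar_def() have come through as columns alongside the lon, lat, and event_no dimensions.
# Peek at the first few rows of the reloaded data
head(MHW_res_nc)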
# Plot the duration results
quick_grid(MHW_res_grid, "duration")
quick_grid(MHW_res_nc, "duration")
# Cumulative intensity
quick_grid(MHW_res_grid, "intensity_cumulative")
quick_grid(MHW_res_nc, "cum_int")