Potential carbon stocks

Code

library(terra)
library(sf)
library(randomForest)
library(tidyverse)

Prepare data

Get AGB values by year in undisturbed forests.

Code

area <- list.files("data/umrbpl/", full.names = TRUE, pattern = "shp$") |>
  read_sf() |>
  st_transform(crs = "EPSG:4326")
agb_maps <- list.files(
  "data/biomass/", "umrbpl_ESACCI-BIOMASS-L4-AGB-MERGED",
  full.names = TRUE
) |>
  rast() |>
  mask(area)
years <- list.files("data/biomass/", "umrbpl_ESACCI-BIOMASS-L4-AGB-MERGED") |>
  strsplit("-") |>
  lapply(function(x) as.numeric(x[7])) |>
  unlist()
tmf_maps <- list.files("data/tmf/", "UMRBPL", full.names = TRUE) |>
  grep(pattern = paste(years, collapse = "|"), value = TRUE) |>
  rast() |>
  # resample to agb resolution
  resample(agb_maps, method = "modal") |>
  mask(area)
names(agb_maps) <- years
names(tmf_maps) <- gsub("Dec", "", names(tmf_maps))
coords <- data.frame(crds(agb_maps)[sample(nrow(crds(agb_maps)), 1e4), ])

data <- lapply(list(agb_maps, tmf_maps), function(r) {
  terra::extract(r, coords, ID = FALSE, xy = TRUE) |>
    pivot_longer(cols = as.character(years), names_to = "year") |>
    mutate(year = as.numeric(year))
}) |>
  set_names(c("agb", "tmf")) |>
  bind_rows(.id = "data") |>
  pivot_wider(
    id_cols = c("x", "y", "year"),
    names_from = "data", values_from = "value"
  )

Explanatory variables

Code

set.seed(123)
data <- c(
  "data/srtm/topo_variables.tif",
  "data/chelsa/climate_variables.tif",
  "data/wind/UMRBPL_wind-speed_10m.tif",
  "data/osm/dist_to_road.tif",
  "data/osm/dist_to_water.tif"
) |>
  lapply(function(r) {
    r |>
      rast() |>
      mask(area) |>
      resample(agb_maps, method = "mean")
  }) |>
  rast() |>
  terra::extract(data[, c("x", "y")], ID = FALSE) |>
  rename(dist_to_road = elevation.1) |>
  rename(dist_to_water = elevation.2) |>
  bind_cols(data)

Random forest

Code

rf_data <- data |>
  filter(year == 2022) |>
  rename(wind = `PHL_wind-speed_10m`) |>
  select(
    agb, elevation, slope, aspect, twi, mean_temperature,
    mean_precipitation, wind, tmf, dist_to_road, dist_to_water
  ) |>
  drop_na()
type <- c("cal", "eval")[1 + rbinom(nrow(rf_data), 1, 0.2)]

Code

rf_agb <- randomForest(agb ~ .,
  data = rf_data[type == "cal", ],
  importance = TRUE
)
save(rf_agb, file = "models/rf_agb.rda")

Code

load("models/rf_agb.rda")
predictions <- predict(rf_agb, newdata = rf_data[type == "eval", ])
errors <- rf_data$agb[type == "eval"] - predictions
rmse <- sqrt(mean(errors^2))
print(paste("RMSE:", round(rmse), "Mg/ha"))

[1] "RMSE: 38 Mg/ha"

Code

r_squared_oob <- rf_agb$rsq[length(rf_agb$rsq)] * 100
print(paste(
  "Out-of-Bag R-squared (% Var Explained):",
  round(r_squared_oob), "%"
))

[1] "Out-of-Bag R-squared (% Var Explained): 51 %"

Code

# Calculate R-squared on the test set
ss_total <- sum((rf_data$agb[type == "eval"] -
                   mean(rf_data$agb[type == "eval"]))^2)
ss_residual <- sum((rf_data$agb[type == "eval"] - predictions)^2)
r_squared_test <- 1 - (ss_residual / ss_total)
print(paste("Test Set R-squared:", round(r_squared_test * 100), "%"))

[1] "Test Set R-squared: 61 %"

Code

varImpPlot(rf_agb)

The partial dependence plots are represented below.

Code

data_all <- data |>
  filter(year == 2022) |>
  rename(wind = `PHL_wind-speed_10m`) |>
  select(
    agb, elevation, slope, aspect, twi, mean_temperature,
    mean_precipitation, wind, tmf, dist_to_road, dist_to_water
  ) |> drop_na()

preds <- lapply(colnames(data_all)[-1], function(x) {
  data.frame(
    value = seq(
      min(data_all[[x]]),
      max(data_all[[x]]),
      length.out = 50
    )
  )
})
names(preds) <- colnames(data_all)[-1]
preds <- bind_rows(preds, .id = "var")

partial <- function(model, data, variable, val, quants = c(0.1, 0.5, 0.9)) {
  new_data <- data
  new_data[, variable] <- val
  predict(model, new_data) |>
    quantile(probs = quants)
}
preds |> 
  group_by(var, value) |> 
  summarise(quant = c("Q10", "Q50", "Q90"), 
            pred = partial(rf_agb, select(data_all, -agb), var, value)) |>
  write.csv("models/rf_agb_partial.csv", row.names = FALSE)

Code

data_all <- data[sample(5000),] |>
  filter(year == 2022) |>
  rename(wind = `PHL_wind-speed_10m`) |>
  select(
    agb, elevation, slope, aspect, twi, mean_temperature,
    mean_precipitation, wind, tmf, dist_to_road, dist_to_water
  ) |> drop_na() |> 
  pivot_longer(!agb, names_to="var")
read.csv("models/rf_agb_partial.csv") |>
  pivot_wider(names_from = "quant", values_from = "pred") |>
  ggplot(aes(value, Q50)) +
  geom_ribbon(aes(ymin = Q10, ymax = Q90), alpha = 0.1) +
  geom_point(data = data_all, aes(y = agb), alpha = 0.2) +
  geom_line() +
  facet_wrap(~var, scales = "free") +
  theme_classic() +
  theme(legend.position = "none") +
  labs(x = "Variable value",
       y = "Distribution of predicted and observed SOC stocks")

Partial dependence plots showing the effect of individual variables on predicted SOC stocks. Solid lines represent the median predictions, while the grey areas represent the 10–90% quantiles. The points represent observations, with each colour representing a different site.

Prepare data

Random forest

Spatial cross validation