# Build the long-format table used to compare continuous AMBI results.
#
# tbep_dat:  output of anlz_ambiscr (has PercentG1-G5, BC, AreaAbbr)
# epchc_dat: AMBI3 or AMBI-TB3 Excel file (has PercentG1-G5,
#            BioticCoefficient)
#
# Both inputs are matched on yr and StationNumber (yr is derived from the
# Year column on the EPCHC side). The result has one row per
# yr / StationNumber / AreaAbbr / variable, with the paired values in the
# columns `tbep` and `epchc`. `segs` is not an argument; it is read from the
# enclosing environment.
prep_continuous <- function(tbep_dat, epchc_dat) {
  pct_vars <- c(
    'PercentG1', 'PercentG2', 'PercentG3', 'PercentG4', 'PercentG5'
  )
  all_vars <- c(pct_vars, 'BC')

  # Named vectors (new name = source column) drive the renaming selects.
  tbep_lookup <- setNames(all_vars, paste0(all_vars, '_tbep'))
  epchc_lookup <- setNames(
    c(pct_vars, 'BioticCoefficient'),
    paste0(all_vars, '_epchc')
  )

  # EPCHC side: missing group percentages are replaced with zero.
  epchc_tbl <- epchc_dat |>
    mutate(yr = as.integer(Year)) |>
    select(yr, StationNumber, all_of(epchc_lookup)) |>
    mutate(across(starts_with('Percent'), \(x) coalesce(x, 0)))

  # TBEP side: keep scored rows inside the segments of interest.
  # NOTE(review): BC == 7 is dropped, presumably the azoic score; confirm.
  tbep_tbl <- tbep_dat |>
    filter(!is.na(BC) & BC != 7 & AreaAbbr %in% segs) |>
    mutate(AreaAbbr = factor(AreaAbbr, levels = segs)) |>
    select(yr, StationNumber, AreaAbbr, all_of(tbep_lookup))

  paired <- inner_join(tbep_tbl, epchc_tbl, by = c('yr', 'StationNumber'))
  paired <- filter(paired, !is.na(BC_epchc))

  # '<variable>_<source>' columns become one row per variable, with the
  # source suffix turned into the value columns via '.value'.
  long <- pivot_longer(
    paired,
    cols = !c(yr, StationNumber, AreaAbbr),
    names_to = c('variable', '.value'),
    names_pattern = '(.+)_(tbep|epchc)'
  )

  mutate(long, variable = factor(variable, levels = all_vars))
}
# Build the wide table used to compare categorical AMBI results.
#
# tbep_dat:  output of anlz_ambiscr
# epchc_dat: AMBI4 or AMBI-TB4 Excel file
#
# Returns the rows present in both inputs (matched on yr and StationNumber).
# TBEP classifications keep their original column names; the EPCHC ones are
# renamed SPC_epchc, BI_epchc, DEG_epchc and BCH_epchc. `segs` is not an
# argument; it is read from the enclosing environment.
prep_categorical <- function(tbep_dat, epchc_dat) {
  class_cols <- c(
    'SitePollutionClassification', 'BioticIndex',
    'DominatingEcologicalGroup', 'BenthicCommunityHealth'
  )

  # New name = source column, consumed by select(all_of(...)).
  epchc_lookup <- c(
    SPC_epchc = 'SitePollutionClassification',
    BI_epchc = 'BioticIndex',
    DEG_epchc = 'DominatingEcologicalGroup',
    BCH_epchc = 'BenthicCommunityHealth'
  )

  # EPCHC side: integer year key, BioticIndex stored as text.
  epchc_tbl <- epchc_dat |>
    mutate(yr = as.integer(Year)) |>
    mutate(BioticIndex = as.character(BioticIndex)) |>
    select(yr, StationNumber, all_of(epchc_lookup))

  # TBEP side: restrict to the segments of interest, ordered as in `segs`.
  tbep_tbl <- filter(tbep_dat, AreaAbbr %in% segs)
  tbep_tbl <- mutate(tbep_tbl, AreaAbbr = factor(AreaAbbr, levels = segs))
  tbep_tbl <- select(
    tbep_tbl,
    yr, StationNumber, AreaAbbr, all_of(class_cols)
  )

  tbep_tbl |>
    inner_join(epchc_tbl, by = c('yr', 'StationNumber'))
}
# Confusion matrices as a list of flextables, one per bay segment.
#
# dat:       data frame holding both classification columns and AreaAbbr
# tbep_col:  name (string) of the TBEP classification column
# epchc_col: name (string) of the EPCHC classification column
#
# Rows with a missing or 'Azoic' value in either column are dropped first.
# Each table has TBEP categories as rows and EPCHC categories as columns;
# cells where the two agree (the diagonal) are shaded blue. Segments follow
# the order of `segs`, which is read from the enclosing environment, and
# only segments with remaining rows get a table.
table_confusion <- function(dat, tbep_col, epchc_col) {
  scored <- dat |>
    filter(!is.na(.data[[tbep_col]]), !is.na(.data[[epchc_col]])) |>
    filter(.data[[tbep_col]] != 'Azoic', .data[[epchc_col]] != 'Azoic')

  # Shared category set so every segment's table has the same layout.
  cat_levels <- sort(unique(c(scored[[tbep_col]], scored[[epchc_col]])))
  segs_present <- intersect(segs, as.character(unique(scored$AreaAbbr)))

  # Builds the shaded cross-tabulation for a single segment.
  one_segment <- function(seg) {
    wide <- scored |>
      filter(AreaAbbr == seg) |>
      count(TBEP = .data[[tbep_col]], EPCHC = .data[[epchc_col]]) |>
      complete(TBEP = cat_levels, EPCHC = cat_levels, fill = list(n = 0L)) |>
      pivot_wider(names_from = EPCHC, values_from = n, values_fill = 0L)

    # Every column except the TBEP label column is an EPCHC category.
    n_epchc <- sum(names(wide) != 'TBEP')

    ft <- flextable(wide)
    ft <- set_header_labels(ft, TBEP = '')
    ft <- add_header_row(
      ft,
      values = c('TBEP', 'EPCHC'),
      colwidths = c(1L, n_epchc)
    )
    ft <- add_header_lines(ft, values = paste('Bay Segment:', seg))
    ft <- align(ft, align = 'center', part = 'all')
    ft <- align(ft, j = 1L, align = 'left', part = 'body')
    ft <- bold(ft, bold = FALSE, part = 'header')
    ft <- autofit(ft)

    # Shade the cell where the row category equals the column category.
    for (lvl in cat_levels) {
      row_idx <- which(wide$TBEP == lvl)
      col_idx <- which(names(wide) == lvl)
      if (length(row_idx) > 0L && length(col_idx) > 0L) {
        ft <- bg(ft, i = row_idx, j = col_idx, bg = '#BDD7EE')
      }
    }

    ft
  }

  lapply(segs_present, one_segment)
}