Today I want to test some ways to deal with implicit missing values: namely, creating grids with several commands and performing full joins on our data. Let’s use again COVID-19 vaccinations data in Italy, available from the official repo.
Load Data
url_vaccinations <-
'https://raw.githubusercontent.com/italia/covid19-opendata-vaccini/master/dati/somministrazioni-vaccini-latest.csv'
read_csv(url_vaccinations,
col_types = cols(
# parse as dates
data_somministrazione = "D",
# parse as factors
fornitore = "f",
area = "f",
fascia_anagrafica = "f"
# the rest, let it be guessed
)) %>%
# remove 'categoria' from several column names
rename_with( ~ stringr::str_remove(.x, 'categoria_')) %>%
# shorten some other variable names
rename(
operatori_sanitari = operatori_sanitari_sociosanitari,
data = data_somministrazione
) %>%
# create a new column with total vaccinations
mutate(nuovi_vaccinati = sesso_maschile + sesso_femminile) %>%
# reorder columns
relocate(nuovi_vaccinati, .after = 'fascia_anagrafica') ->
vaccinations
vaccinations %>% head(9)
## # A tibble: 9 x 17
## data fornitore area fascia_anagrafi… nuovi_vaccinati sesso_maschile
## <date> <fct> <fct> <fct> <dbl> <dbl>
## 1 2020-12-27 Pfizer/B… ABR 20-29 1 1
## 2 2020-12-27 Pfizer/B… ABR 30-39 4 1
## 3 2020-12-27 Pfizer/B… ABR 40-49 7 1
## 4 2020-12-27 Pfizer/B… ABR 50-59 9 4
## 5 2020-12-27 Pfizer/B… ABR 60-69 14 10
## 6 2020-12-27 Pfizer/B… ABR 70-79 1 1
## 7 2020-12-27 Pfizer/B… ABR 80-89 1 1
## 8 2020-12-27 Pfizer/B… BAS 20-29 9 4
## 9 2020-12-27 Pfizer/B… BAS 30-39 28 10
## # … with 11 more variables: sesso_femminile <dbl>, operatori_sanitari <dbl>,
## # personale_non_sanitario <dbl>, ospiti_rsa <dbl>, over80 <dbl>,
## # prima_dose <dbl>, seconda_dose <dbl>, codice_NUTS1 <chr>,
## # codice_NUTS2 <chr>, codice_regione_ISTAT <dbl>, nome_area <chr>
Create Grids
To fill implicit missing values, we need to check against every possible combinations of factors and dates in our data. This means creating a grid and then performing a full join.
expand_grid(
data = seq.Date(from = min(vaccinations$data),
to = max(vaccinations$data),
by = 'day'),
area = forcats::fct_unique(vaccinations$area),
fornitore = forcats::fct_unique(vaccinations$fornitore),
fascia_anagrafica = forcats::fct_unique(vaccinations$fascia_anagrafica),
) %>% head()
## # A tibble: 6 x 4
## data area fornitore fascia_anagrafica
## <date> <fct> <fct> <fct>
## 1 2020-12-27 ABR Pfizer/BioNTech 20-29
## 2 2020-12-27 ABR Pfizer/BioNTech 30-39
## 3 2020-12-27 ABR Pfizer/BioNTech 40-49
## 4 2020-12-27 ABR Pfizer/BioNTech 50-59
## 5 2020-12-27 ABR Pfizer/BioNTech 60-69
## 6 2020-12-27 ABR Pfizer/BioNTech 70-79
The same can be achieved in this way:
vaccinations %>%
expand(
# create a full sequence between the first and last date
full_seq(data, 1),
area, fornitore, fascia_anagrafica)
## # A tibble: 13,608 x 4
## `full_seq(data, 1)` area fornitore fascia_anagrafica
## <date> <fct> <fct> <fct>
## 1 2020-12-27 ABR Pfizer/BioNTech 20-29
## 2 2020-12-27 ABR Pfizer/BioNTech 30-39
## 3 2020-12-27 ABR Pfizer/BioNTech 40-49
## 4 2020-12-27 ABR Pfizer/BioNTech 50-59
## 5 2020-12-27 ABR Pfizer/BioNTech 60-69
## 6 2020-12-27 ABR Pfizer/BioNTech 70-79
## 7 2020-12-27 ABR Pfizer/BioNTech 80-89
## 8 2020-12-27 ABR Pfizer/BioNTech 90+
## 9 2020-12-27 ABR Pfizer/BioNTech 16-19
## 10 2020-12-27 ABR Moderna 20-29
## # … with 13,598 more rows
Much neater, uh? Still, there are very many values and once the data gets larger it will take some more time.
vaccinations %>%
full_join(vaccinations %>% expand(data, area, fornitore, fascia_anagrafica),
by = c('data', 'area', 'fornitore', 'fascia_anagrafica')) %>%
# sort data
arrange(area, data) %>%
# replace NAs that popped up
mutate(across(where(is.numeric), ~ replace_na(.x, 0))) %>%
# don't know why it does not work
mutate(fascia_anagrafica = fct_inorder(fascia_anagrafica)) -> vaccinations_ita
vaccinations_ita %>% head(10)
## # A tibble: 10 x 17
## data fornitore area fascia_anagrafi… nuovi_vaccinati sesso_maschile
## <date> <fct> <fct> <fct> <dbl> <dbl>
## 1 2020-12-27 Pfizer/B… ABR 20-29 1 1
## 2 2020-12-27 Pfizer/B… ABR 30-39 4 1
## 3 2020-12-27 Pfizer/B… ABR 40-49 7 1
## 4 2020-12-27 Pfizer/B… ABR 50-59 9 4
## 5 2020-12-27 Pfizer/B… ABR 60-69 14 10
## 6 2020-12-27 Pfizer/B… ABR 70-79 1 1
## 7 2020-12-27 Pfizer/B… ABR 80-89 1 1
## 8 2020-12-27 Pfizer/B… ABR 90+ 0 0
## 9 2020-12-27 Pfizer/B… ABR 16-19 0 0
## 10 2020-12-27 Moderna ABR 20-29 0 0
## # … with 11 more variables: sesso_femminile <dbl>, operatori_sanitari <dbl>,
## # personale_non_sanitario <dbl>, ospiti_rsa <dbl>, over80 <dbl>,
## # prima_dose <dbl>, seconda_dose <dbl>, codice_NUTS1 <chr>,
## # codice_NUTS2 <chr>, codice_regione_ISTAT <dbl>, nome_area <chr>
This is a very great deal of observations, and as of this day there are more missing values than complete ones. This is because Moderna started deliveries more than two weeks later than Pfizer/BioNTech (as the European Center for Disease Prevention and Control approved the vaccine later) and there were also some days between the start of the campaign and approximately January 10th where no vaccines administered.
Now we want to create some smaller datasets to be able to streamline visualisations later.
Group Data by Age Range
Let’s group the data by data
, fornitore
and fascia_anagrafica
to see how vaccinations proceed within the same population range. Furthermore, compute the cumulative totals for each numeric variable.
vaccinations_ita %>%
group_by(data, fascia_anagrafica, fornitore) %>%
summarise(across(where(is.numeric), sum)) %>%
mutate(across(where(is.numeric), list(totale = ~ cumsum(.x)))) %>%
arrange(data, fornitore) %>%
rename(vaccinati_totale = nuovi_vaccinati_totale) -> vaccinations_by_age_ita
vaccinations_by_age_ita %>% head()
## # A tibble: 6 x 23
## # Groups: data, fascia_anagrafica [6]
## data fascia_anagrafi… fornitore nuovi_vaccinati sesso_maschile
## <date> <fct> <fct> <dbl> <dbl>
## 1 2020-12-27 20-29 Pfizer/B… 640 236
## 2 2020-12-27 30-39 Pfizer/B… 1023 458
## 3 2020-12-27 40-49 Pfizer/B… 1428 537
## 4 2020-12-27 50-59 Pfizer/B… 2112 893
## 5 2020-12-27 60-69 Pfizer/B… 1429 1036
## 6 2020-12-27 70-79 Pfizer/B… 129 87
## # … with 18 more variables: sesso_femminile <dbl>, operatori_sanitari <dbl>,
## # personale_non_sanitario <dbl>, ospiti_rsa <dbl>, over80 <dbl>,
## # prima_dose <dbl>, seconda_dose <dbl>, codice_regione_ISTAT <dbl>,
## # vaccinati_totale <dbl>, sesso_maschile_totale <dbl>,
## # sesso_femminile_totale <dbl>, operatori_sanitari_totale <dbl>,
## # personale_non_sanitario_totale <dbl>, ospiti_rsa_totale <dbl>,
## # over80_totale <dbl>, prima_dose_totale <dbl>, seconda_dose_totale <dbl>,
## # codice_regione_ISTAT_totale <dbl>
What to do with this?
Before moving on to grouping the data, let’s see how many people by age range received a shot (and from which supplier) by age range over time.
vaccinations_by_age_ita %>%
ggplot(aes(data, nuovi_vaccinati, fill = fornitore)) +
geom_col() +
facet_wrap(~ fascia_anagrafica) +
scale_fill_viridis_d(begin = 0.75, end = 0.25) +
theme_minimal()
We can also obtain the totals by age range:
vaccinations_by_age_ita %>%
# remove cumulative sums, as they will be obtained via `summarise`:
select(1:12) %>%
group_by(fascia_anagrafica, fornitore) %>%
summarise(across(where(is.numeric), sum)) %>%
rename(vaccinati_totale = nuovi_vaccinati) -> totals_by_age_ita
totals_by_age_ita %>% head()
## # A tibble: 6 x 11
## # Groups: fascia_anagrafica [3]
## fascia_anagrafi… fornitore vaccinati_totale sesso_maschile sesso_femminile
## <fct> <fct> <dbl> <dbl> <dbl>
## 1 20-29 Pfizer/B… 193924 65151 128773
## 2 20-29 Moderna 1427 578 849
## 3 30-39 Pfizer/B… 308332 121179 187153
## 4 30-39 Moderna 1897 807 1090
## 5 40-49 Pfizer/B… 383252 125121 258131
## 6 40-49 Moderna 2332 1025 1307
## # … with 6 more variables: operatori_sanitari <dbl>,
## # personale_non_sanitario <dbl>, ospiti_rsa <dbl>, over80 <dbl>,
## # prima_dose <dbl>, seconda_dose <dbl>
See how first and second shots are distributed by age range and supplier. Arguably, this is not very informative. But let’s wait a couple of months: I bet it will prove useful!
totals_by_age_ita %>%
# we don't need all of these columns
select(fascia_anagrafica, fornitore, prima_dose, seconda_dose) %>%
# shift to long format and exclude cols that can't be pivoted
pivot_longer(!c(fascia_anagrafica, fornitore),
# give names to the new cols
names_to = 'values',
values_to = 'counts') %>%
ggplot(aes(fornitore, counts, fill = values)) +
geom_col() +
facet_wrap(~ fascia_anagrafica) +
scale_fill_viridis_d(begin = 0.55, end = 0.95) +
theme_minimal() +
labs(
title = 'Number of shots by supplier, per age range',
x = 'date',
y = NULL,
fill = NULL
) +
theme(legend.position = 'bottom')
We can also picture the data as percentages:
totals_by_age_ita %>%
# we don't need all of these columns
mutate(across(
# across these two cols
c(prima_dose, seconda_dose),
# the new cols will be called {colname}.pct
list(pct = ~ .x / (prima_dose + seconda_dose))
)) %>%
select(fascia_anagrafica, fornitore, prima_dose_pct, seconda_dose_pct) %>%
# shift to long format and exclude cols that can't be pivoted
pivot_longer(!c(fascia_anagrafica, fornitore),
# give names to the new cols
names_to = 'values',
values_to = 'counts') %>%
ggplot(aes(fornitore, counts, fill = values)) +
geom_col() +
facet_wrap(~ fascia_anagrafica) +
scale_fill_viridis_d(begin = 0.55, end = 0.95) +
scale_y_continuous(labels = scales::percent) +
theme_minimal() +
labs(
title = '% of shots by supplier, per age range',
x = 'date',
y = NULL,
fill = NULL
) +
theme(legend.position = 'bottom')
Does it really tell something? Well, it might not. But these posts are meant to document the process to find the better one and, of course, practice! We can do better using the time series data:
vaccinations_by_age_ita %>%
select(data, fascia_anagrafica, fornitore, prima_dose, seconda_dose) %>%
pivot_longer(
!c(data,fascia_anagrafica, fornitore),
names_to = 'values',
values_to = 'counts'
) %>%
ggplot(aes(data, counts, fill = values)) +
geom_col() +
facet_wrap(~ fascia_anagrafica) +
scale_fill_viridis_d(begin = 0.55, end = 0.95) +
theme_minimal() +
labs(
title = 'Vaccine Shots by Age Range',
x = 'date',
y = NULL,
fill = NULL
) +
theme(legend.position = 'bottom')
This does not tell much either: the additional dimension of the doses is not enriched by these additional dimensions (but it’s pretty good looking, is it not?).
Group Data By Area
Let’s wrap up by grouping the data by area
.
vaccinations_ita %>%
group_by(data, area, fornitore) %>%
# create new columns with the sum of the grouped rows
summarise(across(where(is.numeric), sum)) %>%
# reorder
relocate(nuovi_vaccinati, .after = area) %>%
# create new rows with cumulative sums
mutate(across(where(is.numeric), list(totale = ~ cumsum(.x)))) %>%
rename(vaccinati_totale = nuovi_vaccinati_totale) %>%
arrange(area) -> vaccinations_by_area_ita
Before grouping again to get the totals, we want to enrich this data with the population and deliveries data. This script is a bit more elaborate:
read_csv(
'https://raw.githubusercontent.com/orizzontipolitici/covid19-vaccine-data/main/data_ita/doses_by_area_ita.csv'
) -> doses_by_area
vaccinations_by_area_ita %>%
# `doses_by_area` is wide in format: we don't need `fornitore`
select(1:12, -fornitore) %>%
group_by(area) %>%
summarise(across(where(is.numeric), sum)) %>%
rename(vaccinati_totale = nuovi_vaccinati) %>%
# also, it has the column for Italy, so we have to add one new line
add_row(area = 'ITA',
# the `.` indicates the object itself!
vaccinati_totale = sum(.$vaccinati_totale),
sesso_maschile = sum(.$sesso_maschile),
sesso_femminile = sum(.$sesso_femminile),
operatori_sanitari = sum(.$operatori_sanitari),
personale_non_sanitario = sum(.$personale_non_sanitario),
ospiti_rsa = sum(.$ospiti_rsa),
over80 = sum(.$over80),
prima_dose = sum(.$prima_dose),
seconda_dose = sum(.$seconda_dose),
) %>%
# perform inner join by area!
inner_join(doses_by_area, by = 'area') %>%
relocate(
nome, NUTS2, area, popolazione_2020
) %>%
# create new columns:
mutate(
# vaccinated each 1000 people
vaccinati_ogni_mille = round(vaccinati_totale / popolazione_2020 * 1000, digits = 2),
# share of vaccines used out of all received
percent_vaccini_usati = round(vaccinati_totale / totale_dosi * 100, digits = 2)
) -> totals_by_area_ita
totals_by_area_ita
## # A tibble: 22 x 19
## nome NUTS2 area popolazione_2020 vaccinati_totale sesso_maschile
## <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 Abru… ITF1 ABR 1293941 29268 10722
## 2 Basi… ITF5 BAS 553254 16609 6735
## 3 Cala… ITF6 CAL 1894110 42925 20822
## 4 Camp… ITF3 CAM 5712143 175906 85039
## 5 Emil… ITH5 EMR 4464119 195438 65908
## 6 Friu… ITH4 FVG 1206216 49170 16461
## 7 Lazio ITI4 LAZ 5755700 186682 72315
## 8 Ligu… ITC3 LIG 1524826 52122 18426
## 9 Lomb… ITC4 LOM 10027602 299631 98518
## 10 Marc… ITI3 MAR 1512672 42841 15369
## # … with 12 more rows, and 13 more variables: sesso_femminile <dbl>,
## # operatori_sanitari <dbl>, personale_non_sanitario <dbl>, ospiti_rsa <dbl>,
## # over80 <dbl>, prima_dose <dbl>, seconda_dose <dbl>, totale_pfizer <dbl>,
## # totale_moderna <dbl>, totale_dosi <dbl>, dosi_ogni_mille <dbl>,
## # vaccinati_ogni_mille <dbl>, percent_vaccini_usati <dbl>