Cleaning

Set up

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.2     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Import data

# Load the raw HIV data export from CSV into a tibble.
df <- read_csv(file = "data/hiv.csv")

Rows: 1090 Columns: 14
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (14): Series Name, Series Code, Country Name, Country Code, 2014 [YR2014...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Inspect data

# Preview the column names, types, and first few values of the raw data.
df |> glimpse()

Rows: 1,090
Columns: 14
$ `Series Name`   <chr> "Prevalence of HIV, total (% of population ages 15-49)…
$ `Series Code`   <chr> "SH.DYN.AIDS.ZS", "SH.DYN.AIDS.ZS", "SH.DYN.AIDS.ZS", …
$ `Country Name`  <chr> "Afghanistan", "Albania", "Algeria", "American Samoa",…
$ `Country Code`  <chr> "AFG", "ALB", "DZA", "ASM", "AND", "AGO", "ATG", "ARG"…
$ `2014 [YR2014]` <chr> "0.1", "0.1", "0.1", "..", "..", "1.9", "..", "0.4", "…
$ `2015 [YR2015]` <chr> "0.1", "0.1", "0.1", "..", "..", "1.8", "..", "0.4", "…
$ `2016 [YR2016]` <chr> "0.1", "0.1", "0.1", "..", "..", "1.8", "..", "0.4", "…
$ `2017 [YR2017]` <chr> "0.1", "0.1", "0.1", "..", "..", "1.7", "..", "0.4", "…
$ `2018 [YR2018]` <chr> "0.1", "0.1", "0.1", "..", "..", "1.7", "..", "0.4", "…
$ `2019 [YR2019]` <chr> "0.1", "0.1", "0.1", "..", "..", "1.6", "..", "0.4", "…
$ `2020 [YR2020]` <chr> "0.1", "0.1", "0.1", "..", "..", "1.6", "..", "0.4", "…
$ `2021 [YR2021]` <chr> "0.1", "0.1", "0.1", "..", "..", "1.5", "..", "0.4", "…
$ `2022 [YR2022]` <chr> "0.1", "0.1", "0.1", "..", "..", "1.5", "..", "0.4", "…
$ `2023 [YR2023]` <chr> "..", "..", "..", "..", "..", "..", "..", "..", "..", …

Data cleaning

# Cells holding ".." (invalid data) are recoded to NA in every character column.
df <- df |>
  mutate(across(where(is.character), \(col) na_if(col, "..")))

# Check that the ".." placeholders now show up as NA.
df |> glimpse()

Rows: 1,090
Columns: 14
$ `Series Name`   <chr> "Prevalence of HIV, total (% of population ages 15-49)…
$ `Series Code`   <chr> "SH.DYN.AIDS.ZS", "SH.DYN.AIDS.ZS", "SH.DYN.AIDS.ZS", …
$ `Country Name`  <chr> "Afghanistan", "Albania", "Algeria", "American Samoa",…
$ `Country Code`  <chr> "AFG", "ALB", "DZA", "ASM", "AND", "AGO", "ATG", "ARG"…
$ `2014 [YR2014]` <chr> "0.1", "0.1", "0.1", NA, NA, "1.9", NA, "0.4", "0.2", …
$ `2015 [YR2015]` <chr> "0.1", "0.1", "0.1", NA, NA, "1.8", NA, "0.4", "0.2", …
$ `2016 [YR2016]` <chr> "0.1", "0.1", "0.1", NA, NA, "1.8", NA, "0.4", "0.2", …
$ `2017 [YR2017]` <chr> "0.1", "0.1", "0.1", NA, NA, "1.7", NA, "0.4", "0.2", …
$ `2018 [YR2018]` <chr> "0.1", "0.1", "0.1", NA, NA, "1.7", NA, "0.4", "0.3", …
$ `2019 [YR2019]` <chr> "0.1", "0.1", "0.1", NA, NA, "1.6", NA, "0.4", "0.3", …
$ `2020 [YR2020]` <chr> "0.1", "0.1", "0.1", NA, NA, "1.6", NA, "0.4", "0.3", …
$ `2021 [YR2021]` <chr> "0.1", "0.1", "0.1", NA, NA, "1.5", NA, "0.4", "0.3", …
$ `2022 [YR2022]` <chr> "0.1", "0.1", "0.1", NA, NA, "1.5", NA, "0.4", "0.3", …
$ `2023 [YR2023]` <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…

# Reshape the raw wide table (one column per year) into a long table with one
# row per original row and year, and convert the year and value columns to
# numeric types.
df_hiv <- df |>
  # Give the identifier columns short snake_case names.
  rename(
    country = `Country Name`,
    country_code = `Country Code`,
    series = `Series Name`,
    series_code = `Series Code`
  ) |>
  # Select the year columns by name instead of by position (previously 5:13),
  # so the reshape does not silently pick up the wrong columns if the column
  # order of the source file changes. `2023 [YR2023]` is left out, exactly as
  # before. NOTE(review): presumably because it holds no values yet -- confirm.
  pivot_longer(
    cols = `2014 [YR2014]`:`2022 [YR2022]`,
    names_to = "year",
    values_to = "hiv"
  ) |>
  mutate(
    # Former column names look like "2014 [YR2014]"; the first four characters
    # are the year.
    year = as.integer(str_sub(year, start = 1, end = 4)),
    # Convert the values from character to numeric.
    hiv = as.numeric(hiv)
  ) |>
  # Keep only the analysis columns; this drops `series_code` and the unpivoted
  # `2023 [YR2023]` column.
  select(country, country_code, year, series, hiv)

# Inspect the cleaned long-format table.
glimpse(df_hiv)

Rows: 9,810
Columns: 5
$ country      <chr> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan…
$ country_code <chr> "AFG", "AFG", "AFG", "AFG", "AFG", "AFG", "AFG", "AFG", "…
$ year         <int> 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 201…
$ series       <chr> "Prevalence of HIV, total (% of population ages 15-49)", …
$ hiv          <dbl> 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.…

Save the cleaned data

# Write the cleaned table `df_hiv` to an .RData file; load() restores it under
# the same object name.
save(list = "df_hiv", file = "data/hivdata.RData")