tidyr 패키지로 데이터를 정형화하는 방법을 숙지합니다.
tidyr 패키지는 세계보건기구(WHO, World Health Organization)에서 발표한 결핵 신규 환자 현황 데이터인 who
와 관련된 국가별 인구통계 데이터인 population
을 제공합니다.
# A tibble: 7,240 x 60
country iso2 iso3 year new_sp_m014 new_sp_m1524 new_sp_m2534
<chr> <chr> <chr> <int> <int> <int> <int>
1 Afghanistan AF AFG 1980 NA NA NA
2 Afghanistan AF AFG 1981 NA NA NA
3 Afghanistan AF AFG 1982 NA NA NA
4 Afghanistan AF AFG 1983 NA NA NA
5 Afghanistan AF AFG 1984 NA NA NA
6 Afghanistan AF AFG 1985 NA NA NA
7 Afghanistan AF AFG 1986 NA NA NA
8 Afghanistan AF AFG 1987 NA NA NA
9 Afghanistan AF AFG 1988 NA NA NA
10 Afghanistan AF AFG 1989 NA NA NA
# … with 7,230 more rows, and 53 more variables: new_sp_m3544 <int>,
# new_sp_m4554 <int>, new_sp_m5564 <int>, new_sp_m65 <int>,
# new_sp_f014 <int>, new_sp_f1524 <int>, new_sp_f2534 <int>,
# new_sp_f3544 <int>, new_sp_f4554 <int>, new_sp_f5564 <int>,
# new_sp_f65 <int>, new_sn_m014 <int>, new_sn_m1524 <int>,
# new_sn_m2534 <int>, new_sn_m3544 <int>, new_sn_m4554 <int>,
# new_sn_m5564 <int>, new_sn_m65 <int>, new_sn_f014 <int>,
# new_sn_f1524 <int>, new_sn_f2534 <int>, new_sn_f3544 <int>,
# new_sn_f4554 <int>, new_sn_f5564 <int>, new_sn_f65 <int>,
# new_ep_m014 <int>, new_ep_m1524 <int>, new_ep_m2534 <int>,
# new_ep_m3544 <int>, new_ep_m4554 <int>, new_ep_m5564 <int>,
# new_ep_m65 <int>, new_ep_f014 <int>, new_ep_f1524 <int>,
# new_ep_f2534 <int>, new_ep_f3544 <int>, new_ep_f4554 <int>,
# new_ep_f5564 <int>, new_ep_f65 <int>, newrel_m014 <int>,
# newrel_m1524 <int>, newrel_m2534 <int>, newrel_m3544 <int>,
# newrel_m4554 <int>, newrel_m5564 <int>, newrel_m65 <int>,
# newrel_f014 <int>, newrel_f1524 <int>, newrel_f2534 <int>,
# newrel_f3544 <int>, newrel_f4554 <int>, newrel_f5564 <int>,
# newrel_f65 <int>
# Rows of `who` (new tuberculosis case counts) for the Republic of Korea.
who %>%
filter(country %in% "Republic of Korea")
# A tibble: 34 x 60
country iso2 iso3 year new_sp_m014 new_sp_m1524 new_sp_m2534
<chr> <chr> <chr> <int> <int> <int> <int>
1 Republic o… KR KOR 1980 NA NA NA
2 Republic o… KR KOR 1981 NA NA NA
3 Republic o… KR KOR 1982 NA NA NA
4 Republic o… KR KOR 1983 NA NA NA
5 Republic o… KR KOR 1984 NA NA NA
6 Republic o… KR KOR 1985 NA NA NA
7 Republic o… KR KOR 1986 NA NA NA
8 Republic o… KR KOR 1987 NA NA NA
9 Republic o… KR KOR 1988 NA NA NA
10 Republic o… KR KOR 1989 NA NA NA
# … with 24 more rows, and 53 more variables: new_sp_m3544 <int>,
# new_sp_m4554 <int>, new_sp_m5564 <int>, new_sp_m65 <int>,
# new_sp_f014 <int>, new_sp_f1524 <int>, new_sp_f2534 <int>,
# new_sp_f3544 <int>, new_sp_f4554 <int>, new_sp_f5564 <int>,
# new_sp_f65 <int>, new_sn_m014 <int>, new_sn_m1524 <int>,
# new_sn_m2534 <int>, new_sn_m3544 <int>, new_sn_m4554 <int>,
# new_sn_m5564 <int>, new_sn_m65 <int>, new_sn_f014 <int>,
# new_sn_f1524 <int>, new_sn_f2534 <int>, new_sn_f3544 <int>,
# new_sn_f4554 <int>, new_sn_f5564 <int>, new_sn_f65 <int>,
# new_ep_m014 <int>, new_ep_m1524 <int>, new_ep_m2534 <int>,
# new_ep_m3544 <int>, new_ep_m4554 <int>, new_ep_m5564 <int>,
# new_ep_m65 <int>, new_ep_f014 <int>, new_ep_f1524 <int>,
# new_ep_f2534 <int>, new_ep_f3544 <int>, new_ep_f4554 <int>,
# new_ep_f5564 <int>, new_ep_f65 <int>, newrel_m014 <int>,
# newrel_m1524 <int>, newrel_m2534 <int>, newrel_m3544 <int>,
# newrel_m4554 <int>, newrel_m5564 <int>, newrel_m65 <int>,
# newrel_f014 <int>, newrel_f1524 <int>, newrel_f2534 <int>,
# newrel_f3544 <int>, newrel_f4554 <int>, newrel_f5564 <int>,
# newrel_f65 <int>
# Print the `population` tibble: one row per country and year.
population
# A tibble: 4,060 x 3
country year population
<chr> <int> <int>
1 Afghanistan 1995 17586073
2 Afghanistan 1996 18415307
3 Afghanistan 1997 19021226
4 Afghanistan 1998 19496836
5 Afghanistan 1999 19987071
6 Afghanistan 2000 20595360
7 Afghanistan 2001 21347782
8 Afghanistan 2002 22202806
9 Afghanistan 2003 23116142
10 Afghanistan 2004 24018682
# … with 4,050 more rows
# Population records for the Republic of Korea.
population %>%
filter(country %in% "Republic of Korea")
# A tibble: 19 x 3
country year population
<chr> <int> <int>
1 Republic of Korea 1995 44652994
2 Republic of Korea 1996 44940974
3 Republic of Korea 1997 45220543
4 Republic of Korea 1998 45489131
5 Republic of Korea 1999 45742103
6 Republic of Korea 2000 45977210
7 Republic of Korea 2001 46192932
8 Republic of Korea 2002 46393993
9 Republic of Korea 2003 46591762
10 Republic of Korea 2004 46801310
11 Republic of Korea 2005 47033082
12 Republic of Korea 2006 47291491
13 Republic of Korea 2007 47572585
14 Republic of Korea 2008 47867970
15 Republic of Korea 2009 48164969
16 Republic of Korea 2010 48453931
17 Republic of Korea 2011 48732640
18 Republic of Korea 2012 49002683
19 Republic of Korea 2013 49262698
who 데이터 프레임의 변수 이름 중에는 포맷에서 벗어난 것이 있습니다. 이것을 바로잡고 시작합니다.
앞의 변수 설명에서 new_sp_m014 - newrel_f65의 첫 번째 코드는 진단방법(method of diagnosis)입니다. 그런데 진단방법이 rel인 경우에는 다른 변수 이름과 달리 접두어 “new”와 진단방법 코드가 언더라인(_)으로 분리되어 있지 않습니다. 그래서 일관성을 유지하기 위해 “new”와 “rel” 사이에 언더라인을 넣습니다.
# List the column names of `who` to spot the ones that lack the "new_" prefix.
names(who)
[1] "country" "iso2" "iso3" "year"
[5] "new_sp_m014" "new_sp_m1524" "new_sp_m2534" "new_sp_m3544"
[9] "new_sp_m4554" "new_sp_m5564" "new_sp_m65" "new_sp_f014"
[13] "new_sp_f1524" "new_sp_f2534" "new_sp_f3544" "new_sp_f4554"
[17] "new_sp_f5564" "new_sp_f65" "new_sn_m014" "new_sn_m1524"
[21] "new_sn_m2534" "new_sn_m3544" "new_sn_m4554" "new_sn_m5564"
[25] "new_sn_m65" "new_sn_f014" "new_sn_f1524" "new_sn_f2534"
[29] "new_sn_f3544" "new_sn_f4554" "new_sn_f5564" "new_sn_f65"
[33] "new_ep_m014" "new_ep_m1524" "new_ep_m2534" "new_ep_m3544"
[37] "new_ep_m4554" "new_ep_m5564" "new_ep_m65" "new_ep_f014"
[41] "new_ep_f1524" "new_ep_f2534" "new_ep_f3544" "new_ep_f4554"
[45] "new_ep_f5564" "new_ep_f65" "newrel_m014" "newrel_m1524"
[49] "newrel_m2534" "newrel_m3544" "newrel_m4554" "newrel_m5564"
[53] "newrel_m65" "newrel_f014" "newrel_f1524" "newrel_f2534"
[57] "newrel_f3544" "newrel_f4554" "newrel_f5564" "newrel_f65"
# Insert the missing underscore so the "newrel_*" columns use the same
# "new_" prefix as every other case-count column.
names(who) <- names(who) %>%
stringr::str_replace("newrel", "new_rel")
# Print the names again to confirm the rename.
names(who)
[1] "country" "iso2" "iso3" "year"
[5] "new_sp_m014" "new_sp_m1524" "new_sp_m2534" "new_sp_m3544"
[9] "new_sp_m4554" "new_sp_m5564" "new_sp_m65" "new_sp_f014"
[13] "new_sp_f1524" "new_sp_f2534" "new_sp_f3544" "new_sp_f4554"
[17] "new_sp_f5564" "new_sp_f65" "new_sn_m014" "new_sn_m1524"
[21] "new_sn_m2534" "new_sn_m3544" "new_sn_m4554" "new_sn_m5564"
[25] "new_sn_m65" "new_sn_f014" "new_sn_f1524" "new_sn_f2534"
[29] "new_sn_f3544" "new_sn_f4554" "new_sn_f5564" "new_sn_f65"
[33] "new_ep_m014" "new_ep_m1524" "new_ep_m2534" "new_ep_m3544"
[37] "new_ep_m4554" "new_ep_m5564" "new_ep_m65" "new_ep_f014"
[41] "new_ep_f1524" "new_ep_f2534" "new_ep_f3544" "new_ep_f4554"
[45] "new_ep_f5564" "new_ep_f65" "new_rel_m014" "new_rel_m1524"
[49] "new_rel_m2534" "new_rel_m3544" "new_rel_m4554" "new_rel_m5564"
[53] "new_rel_m65" "new_rel_f014" "new_rel_f1524" "new_rel_f2534"
[57] "new_rel_f3544" "new_rel_f4554" "new_rel_f5564" "new_rel_f65"
pivot_wider()
함수는 Long 포맷 테이블을 Wide 포맷 테이블로 변환해주는 함수입니다.
rename_if()
함수는 조건을 만족하는 변수들의 변수 이름을 변경하는 dplyr 패키지의 함수입니다.
# A tibble: 4 x 3
country year population
<chr> <int> <int>
1 Republic of Korea 2010 48453931
2 Republic of Korea 2011 48732640
3 Republic of Korea 2012 49002683
4 Republic of Korea 2013 49262698
# 2. Keep Korea's rows from 2010 onward and reshape long -> wide:
#    each year becomes a column holding that year's population.
population_wide <- population %>%
filter(country %in% "Republic of Korea") %>%
filter(year >= 2010) %>%
pivot_wider(names_from = "year",
values_from = "population")
population_wide
# A tibble: 1 x 5
country `2010` `2011` `2012` `2013`
<chr> <int> <int> <int> <int>
1 Republic of Korea 48453931 48732640 49002683 49262698
# 3. Prefix the name of every integer column with "y",
#    so that `2010` becomes y2010 while `country` is left unchanged.
population_wide2 <- population_wide %>%
rename_if(is.integer, function(x) paste0("y", x))
population_wide2
# A tibble: 1 x 5
country y2010 y2011 y2012 y2013
<chr> <int> <int> <int> <int>
1 Republic of Korea 48453931 48732640 49002683 49262698
pivot_longer()
함수는 Wide 포맷 테이블을 Long 포맷 테이블로 변환해주는 함수입니다.
pivot_longer()
함수는 pivot_wider()
함수의 역함수입니다.
# 1. Reshape wide -> long: the y2010..y2013 column names go into `year`
#    and their cell values go into `population`.
population_long <- population_wide2 %>%
pivot_longer(y2010:y2013,
names_to = "year",
values_to = "population")
population_long
# A tibble: 4 x 3
country year population
<chr> <chr> <int>
1 Republic of Korea y2010 48453931
2 Republic of Korea y2011 48732640
3 Republic of Korea y2012 49002683
4 Republic of Korea y2013 49262698
# 2. Strip the "y" prefix and convert `year` from character back to integer.
population_long2 <- population_long %>%
mutate(year = stringr::str_replace(year, "y", "") %>%
as.integer())
population_long2
# A tibble: 4 x 3
country year population
<chr> <int> <int>
1 Republic of Korea 2010 48453931
2 Republic of Korea 2011 48732640
3 Republic of Korea 2012 49002683
4 Republic of Korea 2013 49262698
tidyr 패키지
는 단독으로 사용되기보다는 dplyr 패키지
와 혼용되는 경우가 많습니다.
stringr::str_detect(group_code, "_m")은 group_code에 "_m"이 포함될 경우에 TRUE를 반환합니다.
정규표현식 "[^[:digit:]]"은 숫자가 아닌 모든 문자와 매칭됩니다.
# 1.1 Take Korea's 2010 record, keep only the new_ep_* columns, and reshape
#     them to long format: one row per group code with its case count.
ep_long <- who %>%
  filter(country %in% "Republic of Korea") %>%
  # `year` is an integer column, so compare it against a number.
  filter(year == 2010) %>%
  select(new_ep_m014:new_ep_f65) %>%
  pivot_longer(new_ep_m014:new_ep_f65,
               names_to = "group_code",
               values_to = "cases")
ep_long
# A tibble: 14 x 2
group_code cases
<chr> <int>
1 new_ep_m014 62
2 new_ep_m1524 511
3 new_ep_m2534 646
4 new_ep_m3544 690
5 new_ep_m4554 687
6 new_ep_m5564 571
7 new_ep_m65 1088
8 new_ep_f014 56
9 new_ep_f1524 439
10 new_ep_f2534 685
11 new_ep_f3544 644
12 new_ep_f4554 780
13 new_ep_f5564 584
14 new_ep_f65 1352
# A tibble: 1 x 1
total_cases
<int>
1 8795
# 2.1 Derive gender and age group from group_code, then spread the
#     gender values into separate columns.
ep_wide <- ep_long %>%
mutate(gender = case_when (
# Codes containing "_m" are labelled male; codes without it are labelled female.
stringr::str_detect(group_code, "_m") ~ "male",
!stringr::str_detect(group_code, "_m") ~ "female")
) %>%
# Remove every non-digit character, leaving only the age digits (e.g. "014").
mutate(age_group = stringr::str_remove_all(group_code, "[^[:digit:]]")) %>%
# Build labels such as "age_014".
mutate(age_group = paste("age", age_group, sep = "_")) %>%
# Drop group_code before pivoting; the result has one row per age_group.
select(-group_code) %>%
pivot_wider(names_from = "gender",
values_from = "cases")
# 2.2 Add the total cases per age group and its share (%) of all cases,
#     rounded to two decimal places.
ep_wide %>%
mutate(total = male + female) %>%
mutate(percent = round(total / sum(total) * 100, 2))
# A tibble: 7 x 5
age_group male female total percent
<chr> <int> <int> <int> <dbl>
1 age_014 62 56 118 1.34
2 age_1524 511 439 950 10.8
3 age_2534 646 685 1331 15.1
4 age_3544 690 644 1334 15.2
5 age_4554 687 780 1467 16.7
6 age_5564 571 584 1155 13.1
7 age_65 1088 1352 2440 27.7
For attribution, please cite this work as
유충현 (2022, Feb. 25). Dataholic: Wrangle data with tidyr. Retrieved from https://choonghyunryu.github.io/2022-02-25-tidyr
BibTeX citation
@misc{유충현2022wrangle, author = {유충현, }, title = {Dataholic: Wrangle data with tidyr}, url = {https://choonghyunryu.github.io/2022-02-25-tidyr}, year = {2022} }