- Introduction to CDI & Stanford Wordbank
- How to Use the Stanford Wordbank Website
- In-Depth Analysis with wordbankr
items_kor_ws
summary
Vocabulary Norms
Item Trajectories
Vocabulary Norms
Item Trajectories
# Read files/wordbank_vocab_data.csv and display the resulting table.
# NOTE(review): presumably the "Vocabulary Norms" export from the Wordbank
# website -- confirm against the slide this snippet belongs to.
read_csv(file = "files/wordbank_vocab_data.csv")
Vocabulary Norms
Item Trajectories
Vocabulary Norms
Item Trajectories
# Read files/wordbank_item_trajectories.csv and display the resulting table.
# NOTE(review): presumably the "Item Trajectories" export from the Wordbank
# website -- confirm against the slide this snippet belongs to.
read_csv(file = "files/wordbank_item_trajectories.csv")
Cross-Linguistic Trajectories
Semantic Networks
Data Export Tools
Cross-Linguistic Trajectories
Semantic Networks
Data Export Tools
# Read the three cross-linguistic CSV files (mommy, food, eat) and display
# each one sorted by ascending `age`.
arrange(read_csv("files/wordbank_crosslinguistic_mommy.csv"), age)
arrange(read_csv("files/wordbank_crosslinguistic_food.csv"), age)
arrange(read_csv("files/wordbank_crosslinguistic_eat.csv"), age)
Cross-Linguistic Trajectories
Semantic Networks
Data Export Tools
Cross-Linguistic Trajectories
Semantic Networks
Data Export Tools
Cross-Linguistic Trajectories
Semantic Networks
Data Export Tools
Cross-Linguistic Trajectories
Semantic Networks
Data Export Tools
# Step 0: read the item-level CSV and display it as-is.
read_csv(file = "files/wordbank_item_data.csv")

# Draw three values at random from 1..641 to use as item ids; the fixed seed
# makes the draw repeatable.
set.seed(1234)
item <- sample(641, 3)

# Step 1: keep only the rows whose item_id was drawn above.
read_csv(file = "files/wordbank_item_data.csv") %>%
  filter(item_id %in% item)

# Step 2: reshape the columns named "18" ... "36" into long format, one row
# per item and age. The column names are strings, so `age` is character here.
read_csv(file = "files/wordbank_item_data.csv") %>%
  filter(item_id %in% item) %>%
  pivot_longer(
    cols = paste(18:36),
    names_to = "age",
    values_to = "proportion"
  )

# Step 3: plot proportion against age, one colour per item, with a dashed
# reference line at proportion = 0.5.
read_csv(file = "files/wordbank_item_data.csv") %>%
  filter(item_id %in% item) %>%
  pivot_longer(
    cols = paste(18:36),
    names_to = "age",
    values_to = "proportion"
  ) %>%
  ggplot(aes(x = age, y = proportion, col = item_definition)) +
  geom_point() +
  geom_hline(yintercept = .5, col = "grey40", linetype = "dashed") +
  theme_classic(base_family = "NanumGothic")
> install.packages("wordbankr") or
> devtools::install_github("langcog/wordbankr")
# Attach wordbankr, open its help index, and list the names found in the
# attached package environment.
library("wordbankr")
help(package = "wordbankr")
ls(name = "package:wordbankr")
##  [1] "fit_aoa"                 "fit_vocab_quantiles"
##  [3] "get_administration_data" "get_crossling_data"
##  [5] "get_crossling_items"     "get_instrument_data"
##  [7] "get_instruments"         "get_item_data"
##  [9] "get_sources"             "summarise_items"
# Query Wordbank for its instruments and its sources; the last call restricts
# the sources to language = "Korean".
wordbankr::get_instruments()
wordbankr::get_sources()
wordbankr::get_sources(language = "Korean")
# Fetch the administration records for language = "Korean", form = "WS" and
# display them. (The assignment and the print must be separate statements;
# on a single line they do not parse.)
admins_kor_ws <- get_administration_data(language = "Korean", form = "WS")
admins_kor_ws

# Number of distinct data_id values in the administration table.
n_distinct(admins_kor_ws$data_id)
## [1] 1370

# Cross-tabulate the administrations: one row per age, one column per value
# of `sex`, each cell holding the number of records. pivot_wider() replaces
# the superseded spread(); names_sort keeps the columns in sorted order.
admins_kor_ws %>%
  count(age, sex) %>%
  pivot_wider(names_from = sex, values_from = n, names_sort = TRUE)
# Fetch the instrument data for language = "Korean", form = "WS" and display
# it. (Assignment and print are separate statements so the line parses.)
inst_kor_ws <- get_instrument_data(language = "Korean", form = "WS")
inst_kor_ws

# Step 1: attach age and sex from the administration table to every response.
# No `by` is given, so the join uses every column the two tables share.
inst_kor_ws %>%
  inner_join(admins_kor_ws %>% select(data_id, age, sex))

# Step 2: keep only rows whose value is "produces" and whose sex is recorded.
inst_kor_ws %>%
  inner_join(admins_kor_ws %>% select(data_id, age, sex)) %>%
  filter(value == "produces" & !is.na(sex))

# Step 3: group the remaining rows by administration (data_id, age, sex).
inst_kor_ws %>%
  inner_join(admins_kor_ws %>% select(data_id, age, sex)) %>%
  filter(value == "produces" & !is.na(sex)) %>%
  group_by(data_id, age, sex)

# Step 4: count the rows in each group and store the count as `production`.
# The result stays grouped by data_id, age and sex.
data_kor_ws <- inst_kor_ws %>%
  inner_join(admins_kor_ws %>% select(data_id, age, sex)) %>%
  filter(value == "produces" & !is.na(sex)) %>%
  group_by(data_id, age, sex) %>%
  count() %>%
  rename(production = n)
data_kor_ws
# Build-up 1: axes and labels only, no geometry yet.
ggplot(data = data_kor_ws, mapping = aes(x = age, y = production, col = sex)) +
  theme_classic() +
  labs(x = "Age (months)", y = "Productive vocabulary size")

# Build-up 2: add the jittered points, coloured by sex.
ggplot(data = data_kor_ws, mapping = aes(x = age, y = production, col = sex)) +
  theme_classic() +
  labs(x = "Age (months)", y = "Productive vocabulary size") +
  geom_jitter(size = 0.5)

# Build-up 3: grey out the points and overlay, per sex, a linear-model fit on
# a natural-spline basis with 2 degrees of freedom.
ggplot(data = data_kor_ws, mapping = aes(x = age, y = production, col = sex)) +
  theme_classic() +
  labs(x = "Age (months)", y = "Productive vocabulary size") +
  geom_jitter(colour = "grey", size = 0.5) +
  geom_smooth(method = "lm", formula = y ~ splines::ns(x, df = 2))
# Fit vocabulary quantile curves to `production`, separately per sex.
# `language` and `form` columns are added to the data before the call.
# (Assignment and print are separate statements so the line parses.)
data_quantiles <- fit_vocab_quantiles(
  vocab_data = data_kor_ws %>% mutate(language = "Korean", form = "WS"),
  measure = production,
  group = sex,
  quantiles = "standard"
)
data_quantiles

# Build-up 1: one empty panel per sex.
ggplot(data_kor_ws, aes(x = age, y = production)) +
  labs(x = "Age (months)", y = "Productive vocabulary size") +
  theme_classic() +
  facet_wrap(~sex)

# Build-up 2: add the jittered points, coloured by sex.
ggplot(data_kor_ws, aes(x = age, y = production, col = sex)) +
  labs(x = "Age (months)", y = "Productive vocabulary size") +
  theme_classic() +
  facet_wrap(~sex) +
  geom_jitter(size = 0.5)

# Build-up 3: grey points with the fitted quantile curves drawn on top. The
# line layer supplies its own data and aesthetics, so it must not inherit
# the plot-level ones (FALSE is spelled out; `F` can be reassigned).
ggplot(data_kor_ws, aes(x = age, y = production)) +
  labs(x = "Age (months)", y = "Productive vocabulary size") +
  theme_classic() +
  facet_wrap(~sex) +
  geom_jitter(colour = "grey", size = 0.5) +
  geom_line(
    data = data_quantiles,
    aes(y = production, x = age, col = quantile),
    inherit.aes = FALSE,
    size = 1
  )
# Call fit_aoa() on the responses joined with age and sex, using
# measure = "produces", method = "glmrob" and proportion = 0.5.
# (Every assignment and its print are separate statements so the code parses.)
aoa_list <- fit_aoa(
  inst_kor_ws %>%
    inner_join(admins_kor_ws %>% select(data_id, age, sex)),
  measure = "produces",
  method = "glmrob",
  proportion = 0.5
)
aoa_list

# Drop the items whose `aoa` is missing.
aoa_list <- aoa_list %>%
  filter(!is.na(aoa))
aoa_list

items_kor_ws # from get_item_data()

# Attach each item's definition and uni_lemma, then display the items
# ordered by ascending aoa. Only the join result is stored; the sorted view
# is printed but not saved.
aoa_list <- aoa_list %>%
  inner_join(items_kor_ws %>% select(num_item_id, definition, uni_lemma))
aoa_list %>%
  arrange(aoa)
Use data from Wordbank to explore questions about language learning.
Use data from Wordbank to explore questions about language learning.
items_kor_ws # from get_item_data()

# Summarise the items and display the result.
# (Assignment and print are separate statements so the line parses.)
item_summary <- summarise_items(items_kor_ws)
item_summary

# Distinct values of the two lexical grouping columns.
unique(item_summary$lexical_category)
## [1] "other"          "nouns"          "function_words" "predicates"
## [5] NA

unique(item_summary$lexical_class)
## [1] "other"          "nouns"          "function_words" "verbs"
## [5] "adjectives"     NA
# All four plots use item_summary without the rows whose lexical_class is NA.

# Build-up 1: axes only.
item_summary %>%
  filter(!is.na(lexical_class)) %>%
  ggplot(aes(x = age, y = production)) +
  theme_classic()

# Build-up 2: every point in translucent grey.
item_summary %>%
  filter(!is.na(lexical_class)) %>%
  ggplot(aes(x = age, y = production)) +
  theme_classic() +
  geom_jitter(size = .5, col = "grey", alpha = .5)

# Build-up 3: points coloured by lexical class.
item_summary %>%
  filter(!is.na(lexical_class)) %>%
  ggplot(aes(x = age, y = production, col = lexical_class)) +
  theme_classic() +
  geom_jitter(size = .5)

# Build-up 4: grey points plus one spline-based linear fit per lexical class.
item_summary %>%
  filter(!is.na(lexical_class)) %>%
  ggplot(aes(x = age, y = production)) +
  theme_classic() +
  geom_jitter(size = .5, col = "grey", alpha = .5) +
  geom_smooth(
    mapping = aes(col = lexical_class),
    method = "lm",
    formula = y ~ splines::ns(x, df = 2)
  )
# Select the items whose category is "games_routines" and display them.
# (Assignment and print are separate statements so the line parses.)
items <- items_kor_ws %>%
  filter(category == "games_routines")
items

# Rows of item_summary that belong to those items.
item_summary %>%
  filter(item_id %in% items$item_id)
# All four plots use the item_summary rows whose item_id appears in `items`,
# drawn as one panel per item in rows of five panels.

# Build-up 1: empty panels, one per definition.
item_summary %>%
  filter(item_id %in% items$item_id) %>%
  ggplot(aes(x = age, y = production)) +
  theme_classic(base_family = "NanumGothic") +
  facet_wrap(~definition, ncol = 5) +
  labs(colour = "Items")

# Build-up 2: add the observations as translucent grey points.
item_summary %>%
  filter(item_id %in% items$item_id) %>%
  ggplot(aes(x = age, y = production)) +
  theme_classic(base_family = "NanumGothic") +
  facet_wrap(~definition, ncol = 5) +
  labs(colour = "Items") +
  geom_point(col = "grey", alpha = .5)

# Build-up 3: overlay a spline-based linear fit in every panel.
item_summary %>%
  filter(item_id %in% items$item_id) %>%
  ggplot(aes(x = age, y = production)) +
  theme_classic(base_family = "NanumGothic") +
  facet_wrap(~definition, ncol = 5) +
  labs(colour = "Items") +
  geom_point(col = "grey", alpha = .5) +
  geom_smooth(method = "lm", formula = y ~ splines::ns(x, df = 2))

# Build-up 4: same plot, with each panel labelled "<definition>_<uni_lemma>".
item_summary %>%
  filter(item_id %in% items$item_id) %>%
  ggplot(aes(x = age, y = production)) +
  theme_classic(base_family = "NanumGothic") +
  facet_wrap(~paste(definition, uni_lemma, sep = "_"), ncol = 5) +
  labs(colour = "Items") +
  geom_point(col = "grey", alpha = .5) +
  geom_smooth(method = "lm", formula = y ~ splines::ns(x, df = 2))
Integrative data analysis
Integrative data analysis
# Fetch the instrument data for language = "Korean", form = "WS" with the
# `administrations` and `iteminfo` options switched on, then display it.
# TRUE is spelled out (`T` is an ordinary variable that can be reassigned),
# and the assignment and print are separate statements so the code parses.
full_kor_ws <- get_instrument_data(
  language = "Korean",
  form = "WS",
  administrations = TRUE,
  iteminfo = TRUE
)
full_kor_ws
# The pipeline below is built up one verb at a time; each step repeats the
# previous one and appends a single stage.

# Step 1: derive two logical flags from `value`. A missing value counts as
# FALSE for both; "produces" also counts as "understands".
full_kor_ws %>%
  dplyr::mutate(
    produces = !is.na(value) & value == "produces",
    understands = !is.na(value) &
      (value == "understands" | value == "produces")
  )

# Step 2: keep only the columns used further down.
full_kor_ws %>%
  dplyr::mutate(
    produces = !is.na(value) & value == "produces",
    understands = !is.na(value) &
      (value == "understands" | value == "produces")
  ) %>%
  dplyr::select(
    age, num_item_id, uni_lemma, category, definition, produces, understands
  )

# Step 3: stack the two flags into measure_name / value pairs.
full_kor_ws %>%
  dplyr::mutate(
    produces = !is.na(value) & value == "produces",
    understands = !is.na(value) &
      (value == "understands" | value == "produces")
  ) %>%
  dplyr::select(
    age, num_item_id, uni_lemma, category, definition, produces, understands
  ) %>%
  tidyr::gather("measure_name", "value", produces, understands)

# Step 4: keep the "produces" measure only.
full_kor_ws %>%
  dplyr::mutate(
    produces = !is.na(value) & value == "produces",
    understands = !is.na(value) &
      (value == "understands" | value == "produces")
  ) %>%
  dplyr::select(
    age, num_item_id, uni_lemma, category, definition, produces, understands
  ) %>%
  tidyr::gather("measure_name", "value", produces, understands) %>%
  dplyr::filter(measure_name == "produces")

# Step 5: group by age and item.
full_kor_ws %>%
  dplyr::mutate(
    produces = !is.na(value) & value == "produces",
    understands = !is.na(value) &
      (value == "understands" | value == "produces")
  ) %>%
  dplyr::select(
    age, num_item_id, uni_lemma, category, definition, produces, understands
  ) %>%
  tidyr::gather("measure_name", "value", produces, understands) %>%
  dplyr::filter(measure_name == "produces") %>%
  dplyr::group_by(age, num_item_id, uni_lemma, category, definition)

# Step 6: per group, count the TRUE and FALSE flags, store the table as
# item_data, and display it.
item_data <- full_kor_ws %>%
  dplyr::mutate(
    produces = !is.na(value) & value == "produces",
    understands = !is.na(value) &
      (value == "understands" | value == "produces")
  ) %>%
  dplyr::select(
    age, num_item_id, uni_lemma, category, definition, produces, understands
  ) %>%
  tidyr::gather("measure_name", "value", produces, understands) %>%
  dplyr::filter(measure_name == "produces") %>%
  dplyr::group_by(age, num_item_id, uni_lemma, category, definition) %>%
  dplyr::summarise(
    num_true = sum(value),
    num_false = dplyr::n() - num_true
  )
item_data
# Pull out the rows for the single item defined as "파이팅" and display them.
word_data <- item_data %>%
  filter(definition == "파이팅")
word_data

# Observed proportion of TRUE flags at each age, shown as a two-column
# data frame (ages, data_prop).
data.frame(
  cbind(
    ages = word_data$age,
    data_prop = word_data$num_true / (word_data$num_true + word_data$num_false)
  )
)
# Inverse logit: maps a value on the log-odds scale to the (0, 1) scale.
#
# x: numeric vector.
# Returns a numeric vector of the same length as x.
inv_logit <- function(x) 1 / (exp(-x) + 1)

# One-column tibble holding every integer age from the smallest to the
# largest age present in item_data. The function definition, this assignment
# and the print are separate statements; fused on one line they do not parse.
ages <- dplyr::tibble(age = min(item_data$age):max(item_data$age))
ages
# Step 1: fit a binomial GLM with robustbase::glmrob, modelling the counts of
# TRUE and FALSE flags for the selected word as a function of age.
robustbase::glmrob(
  formula = cbind(num_true, num_false) ~ age,
  family = "binomial",
  data = word_data
)
##
## Call:  robustbase::glmrob(formula = cbind(num_true, num_false) ~ age, family = "binomial", data = word_data)
##
## Coefficients:
## (Intercept)          age
##     -3.9171       0.1513
##
## Number of observations: 19
## Fitted by method 'Mqle'

# Step 2: predict from the fitted model at every age in `ages`. The values
# below range over negative and positive numbers, i.e. they are not yet
# proportions.
robustbase::glmrob(
  formula = cbind(num_true, num_false) ~ age,
  family = "binomial",
  data = word_data
) %>%
  stats::predict(newdata = ages)
##          1          2          3          4          5          6          7
## -1.1932928 -1.0419722 -0.8906516 -0.7393311 -0.5880105 -0.4366900 -0.2853694
##          8          9         10         11         12         13         14
## -0.1340489  0.0172717  0.1685923  0.3199128  0.4712334  0.6225539  0.7738745
##         15         16         17         18         19
##  0.9251950  1.0765156  1.2278362  1.3791567  1.5304773

# Step 3: pass the predictions through inv_logit(), store them as mod_prop,
# and show them next to their ages.
mod_prop <- robustbase::glmrob(
  formula = cbind(num_true, num_false) ~ age,
  family = "binomial",
  data = word_data
) %>%
  stats::predict(newdata = ages) %>%
  inv_logit()
cbind(ages, mod_prop)
# Step 1: put the model-based and the observed proportions side by side.
# The observed proportions carry their own `age` from word_data and are
# matched on that column explicitly. Binding them to `ages` by row position
# would silently misalign the two series whenever word_data lacks a row for
# some age or is ordered differently.
inner_join(
  cbind(ages, mod_prop),
  word_data %>%
    ungroup() %>%
    transmute(age, data_prop = num_true / (num_true + num_false)),
  by = "age"
)

# Step 2: reshape to long format (one row per age and type) for plotting,
# store the result as plot_prop, and display it. pivot_longer() replaces the
# superseded gather(); the resulting columns are age, type and proportion.
plot_prop <- inner_join(
  cbind(ages, mod_prop),
  word_data %>%
    ungroup() %>%
    transmute(age, data_prop = num_true / (num_true + num_false)),
  by = "age"
) %>%
  pivot_longer(
    cols = c(mod_prop, data_prop),
    names_to = "type",
    values_to = "proportion"
  )
plot_prop
# All four plots share the same frame: proportion against age, coloured by
# type, titled "파이팅", with the y axis fixed to the 0-1 range.

# Build-up 1: frame only, no geometry.
plot_prop %>%
  ggplot(aes(x = age, y = proportion, col = type)) +
  theme_classic(base_family = "NanumGothic") +
  ggtitle("파이팅") +
  coord_cartesian(ylim = c(0, 1))

# Build-up 2: points for the data_prop rows only.
plot_prop %>%
  filter(type == "data_prop") %>%
  ggplot(aes(x = age, y = proportion, col = type)) +
  theme_classic(base_family = "NanumGothic") +
  ggtitle("파이팅") +
  coord_cartesian(ylim = c(0, 1)) +
  geom_point()

# Build-up 3: points for both types.
plot_prop %>%
  ggplot(aes(x = age, y = proportion, col = type)) +
  theme_classic(base_family = "NanumGothic") +
  ggtitle("파이팅") +
  coord_cartesian(ylim = c(0, 1)) +
  geom_point()

# Build-up 4: add grey reference lines at proportion = 0.5 and age = 26.
# NOTE(review): 26 is presumably the age at which the fitted curve crosses
# 0.5 -- confirm against the model coefficients.
plot_prop %>%
  ggplot(aes(x = age, y = proportion, col = type)) +
  theme_classic(base_family = "NanumGothic") +
  ggtitle("파이팅") +
  coord_cartesian(ylim = c(0, 1)) +
  geom_point() +
  geom_hline(yintercept = .5, col = "grey70") +
  geom_vline(xintercept = 26, col = "grey70")
MacArthur-Bates Communicative Development Inventories
– mb-cdi.stanford.edu
Wordbank
– wordbank.stanford.edu
– github.com/langcog/wordbankr
Learning R
- R for Data Science by Hadley Wickham and Garrett Grolemund
- Data Visualization with ggplot2 by Hadley Wickham