library(dplyr) library(data.table) library(ggplot2) library(grid) library(gridExtra) ### set seed set.seed(123) ## Set up Population (4-Component Normal Mixture) ---------------------- # population size p.size <- 100000 # set components g <- 4 # proportion of components lambda <- gtools::rdirichlet(1,rep(1,g)) # cumulative sum c.lambda <- cumsum(lambda) # component means p.means <- runif(g, -2,2) # component sds p.sd <- runif(g, .3,.5) # random number to select components r.num <- runif(p.size) # select components according to lambda probs which.comp <- findInterval(r.num,c.lambda) + 1 # generate population p.dist <- sapply(which.comp, function (w) { rnorm(1,p.means[w],p.sd[w]) }) # rescale to have mean zero p.dist <- p.dist - mean(p.dist) # graph of population distribution g.p.dist <- p.dist %>% as.data.table %>% setnames('x') %>% ggplot(aes(x)) + geom_histogram(col='white', fill='darkblue', alpha=.6, bins=100) + geom_vline(xintercept=0, linetype=2)+ theme_classic() + ggtitle('Population Distribution') + labs(x=NULL,y=NULL) + lims(x=c(-4,4))+ theme(axis.title.y=element_blank(), axis.text.y=element_blank(), axis.ticks.y=element_blank(), axis.text.x=element_text(size=15)) # look at distribution g.p.dist ## Set up Functions ---------------------------------------------------- # function to generate plot gen.plot <- function(df, s.size, c.size=1, strat=F, b=100) { df %>% as.data.table %>% ggplot(aes(x=.)) + geom_histogram(col='darkblue', fill='white', bins=b) + geom_vline(xintercept=0, linetype=2)+ geom_vline(xintercept=mean(df), linetype=1)+ theme_classic() + lims(x=c(-4,4))+ labs(x=NULL, y=NULL) + theme(axis.title.y=element_blank(), axis.text.y=element_blank(), axis.ticks.y=element_blank(), axis.text.x=element_text(size=15)) + ggtitle(paste0( ifelse(c.size==1, 'Simple Random Sampling', ifelse(strat, paste0('Stratified Sampling (# Strata = ', c.size, ')'), paste0('Cluster Sampling (# Clusters = ', c.size, ')') ) ), ' with Sample Size ', s.size ) ) } # function to generate simple random sample gen.srs <- function(samp.size) { p.dist[sample(1:p.size, samp.size, replace=F)] } # function for cluster sampling gen.crs <- function(num.c) { ind <- p.clusters %in% sample(u.clusts, num.c, replace=F) p.dist[ind] } # function for stratified sampling gen.strs <- function(within.samps) { sapply(1:n.strata, function(w) { sample(p.dist[strata==w], within.samps, replace=F) }) %>% c } ## Simple Random Sampling ---------------------------------------------- # sample.size sample.size=10 grid.arrange(g.p.dist, gen.plot(gen.srs(sample.size), s.size=sample.size), ncol=1) ## Central Limit Theorem # number of times to sample n.rep <- 5000 # sample size sample.size=500 m.srs <- sapply(seq_len(n.rep), function(...) gen.srs(sample.size)) %>% colMeans grid.arrange(g.p.dist, gen.plot(m.srs, s.size=sample.size, c.size=1,b=500), ncol=1) ## Cluster Sampling (Worst Case) --------------------------------------- # sort (this is the key!) p.dist <- sort(p.dist) no.clusters <- 100 cluster.sizes <- p.size/no.clusters p.clusters <- rep(1:no.clusters,each=cluster.sizes) u.clusts <- unique(p.clusters) %>% sort p.dist.mat <- cbind(p.dist, p.clusters) p.dist.df <- data.table(p.dist.mat) max.vals <- p.dist.df[,max(p.dist),by=p.clusters] ggplot(p.dist.df, aes(x=p.dist)) + geom_density(fill='darkblue', col='white', alpha=.2) + # geom_vline(xintercept=0, linetype=2)+ geom_vline(xintercept=max.vals[1:(nrow(max.vals)-1),V1], col='black', alpha=.5)+ theme_classic() + ggtitle('Population Distribution') + labs(x=NULL,y=NULL) + lims(x=c(-4,4))+ theme(axis.title.y=element_blank(), axis.text.y=element_blank(), axis.ticks.y=element_blank(), axis.text.x=element_text(size=15)) n.clust = 10 grid.arrange(g.p.dist, gen.plot(gen.crs(n.clust), c.size=n.clust, s.size=n.clust*cluster.sizes), gen.plot(gen.srs(n.clust*cluster.sizes), s.size=n.clust*cluster.sizes), ncol=1) ## why?? samp.c <- sample(u.clusts, n.clust, replace=F) index <- p.clusters %in% samp.c samp <- p.dist[index] # show different clusters gen.plot(p.dist[p.clusters==samp.c[2]], s.size=n.clust*cluster.sizes, c.size=10) ## Central Limit Theorem n.rep <- 5000 # cluster size (vary cluster size) n.clust=20 m.srs <- sapply(seq_len(n.rep), function(...) gen.srs(n.clust*cluster.sizes)) %>% colMeans m.crs <- sapply(seq_len(n.rep), function(...) gen.crs(n.clust)) %>% colMeans grid.arrange(g.p.dist, gen.plot(m.crs, s.size=n.clust*cluster.sizes, c.size=n.clust), gen.plot(m.srs, s.size=n.clust*cluster.sizes, c.size=1, b=500), ncol=1) ### Cluster Sampling (Best case scenario) ------------------------------- # reshuffle (again, this is the key!) p.dist <- p.dist[sample(seq_along(p.dist), length(p.dist), replace=F) ] no.clusters <- 100 cluster.sizes <- p.size/no.clusters p.clusters <- rep(1:no.clusters,each=cluster.sizes) u.clusts <- unique(p.clusters) %>% sort p.dist.mat <- cbind(p.dist, p.clusters) p.dist.df <- data.table(p.dist.mat) max.vals <- p.dist.df[,c(min(p.dist),max(p.dist)),by=p.clusters] ggplot(p.dist.df, aes(x=p.dist)) + geom_density(fill='darkblue', col='white', alpha=.2) + # geom_vline(xintercept=0, linetype=2)+ geom_vline(xintercept=max.vals[,V1], col='black', alpha=.5)+ theme_classic() + ggtitle('Population Distribution') + labs(x=NULL,y=NULL) + lims(x=c(-4,4))+ theme(axis.title.y=element_blank(), axis.text.y=element_blank(), axis.ticks.y=element_blank(), axis.text.x=element_text(size=15)) # compare n.clust = 10 grid.arrange(g.p.dist, gen.plot(gen.crs(n.clust), c.size=n.clust, s.size=n.clust*cluster.sizes), gen.plot(gen.srs(n.clust*cluster.sizes), s.size=n.clust*cluster.sizes), ncol=1) ## why? (show a list of different clusters) samp.c <- sample(u.clusts, n.clust, replace=F) index <- p.clusters %in% samp.c samp <- p.dist[index] gen.plot(p.dist[p.clusters==samp.c[3]], s.size=n.clust*cluster.sizes, c.size=10) ## Central Limit Theorem (best case) n.rep <- 5000 # sample size n.clust=10 m.srs <- sapply(seq_len(n.rep), function(...) gen.srs(n.clust*cluster.sizes)) %>% colMeans m.crs <- sapply(seq_len(n.rep), function(...) gen.crs(n.clust)) %>% colMeans grid.arrange(g.p.dist, gen.plot(m.crs, s.size=n.clust*cluster.sizes, c.size=n.clust, b=500), gen.plot(m.srs, s.size=n.clust*cluster.sizes, c.size=1, b=500), ncol=1) ## Stratified Sampling (Best case) ------------------------------------- # again sort them! p.dist <- sort(p.dist) # define strata n.strata <- 50 n.within <- p.size/n.strata strata <- rep(1:n.strata, each=n.within) p.dist.mat <- cbind(p.dist, strata) p.dist.df <- data.table(p.dist.mat) max.vals <- p.dist.df[,max(p.dist),by=strata] ggplot(p.dist.df, aes(x=p.dist)) + geom_density(fill='darkblue', col='white', alpha=.2) + # geom_vline(xintercept=0, linetype=2)+ geom_vline(xintercept=max.vals[1:(nrow(max.vals)-1),V1], col='black', alpha=.5)+ theme_classic() + ggtitle('Population Distribution') + labs(x=NULL,y=NULL) + lims(x=c(-4,4))+ theme(axis.title.y=element_blank(), axis.text.y=element_blank(), axis.ticks.y=element_blank(), axis.text.x=element_text(size=15)) # show some values within.samps <- 20 grid.arrange(g.p.dist, gen.plot(gen.strs(within.samps), c.size=n.strata, s.size=n.strata*within.samps, strat=T), gen.plot(gen.srs(n.strata*within.samps), s.size=n.strata*within.samps), ncol=1) ## why?? # show different strata gen.plot(p.dist[strata==50], s.size=n.strata*within.samps, c.size=n.strata, strat=T) ## Central Limit Theorem n.rep <- 5000 withins.samps <- 1 m.srs <- sapply(seq_len(n.rep), function(...) gen.srs(n.strata*within.samps)) %>% colMeans m.strs <- sapply(seq_len(n.rep), function(...) gen.strs(within.samps)) %>% colMeans grid.arrange(g.p.dist, gen.plot(m.strs, s.size=n.strata*within.samps, c.size=n.strata, strat=T, b=500), gen.plot(m.srs, s.size=n.strata*within.samps, c.size=1, b=500), ncol=1) ### Stratified Sampling (worst-case scenario) -------------------------- # reshuffle (again, this is the key!) p.dist <- p.dist[sample(seq_along(p.dist), length(p.dist), replace=F) ] # define strata n.strata <- 50 n.within <- p.size/n.strata strata <- rep(1:n.strata, each=n.within) p.dist.mat <- cbind(p.dist, strata) p.dist.df <- data.table(p.dist.mat) max.vals <- p.dist.df[,c(min(p.dist),max(p.dist)),by=strata] ggplot(p.dist.df, aes(x=p.dist)) + geom_density(fill=viridis::viridis(1, begin=.5), col='white', alpha=.2) + geom_vline(xintercept=max.vals[,V1], col='black', alpha=.5)+ theme_classic() + ggtitle('Population Distribution') + labs(x=NULL,y=NULL) + lims(x=c(-4,4))+ theme(axis.title.y=element_blank(), axis.text.y=element_blank(), axis.ticks.y=element_blank(), axis.text.x=element_text(size=15)) # show some values within.samps <- 2 grid.arrange(g.p.dist, gen.plot(gen.strs(within.samps), c.size=n.strata, s.size=n.strata*within.samps, strat=T), gen.plot(gen.srs(n.strata*within.samps), s.size=n.strata*within.samps), ncol=1) ## Central Limit Theorem n.rep <- 5000 withins.samps <- 1 m.srs <- sapply(seq_len(n.rep), function(...) gen.srs(n.strata*within.samps)) %>% colMeans m.strs <- sapply(seq_len(n.rep), function(...) gen.strs(within.samps)) %>% colMeans grid.arrange(g.p.dist, gen.plot(m.strs, s.size=n.strata*within.samps, c.size=n.strata, strat=T), gen.plot(m.srs, s.size=n.strata*within.samps, c.size=1), ncol=1)