The Horvitz-Thompson Estimator
Computes the Horvitz-Thompson estimator of the population total for several variables of interest
HT(y, Pik)
y |
Vector, matrix or data frame containing the recollected information of the variables of interest for every unit in the selected sample |
Pik |
A vector containing the inclusion probabilities for each unit in the selected sample |
The Horvitz-Thompson estimator is given by
∑_{k \in U}\frac{y_k}{{π}_k}
where y_k is the value of the variables of interest for the kth unit, and {π}_k its corresponding inclusion probability. This estimator could be used for without replacement designs as well as for with replacement designs.
The function returns a vector of total population estimates for each variable of interest.
Hugo Andres Gutierrez Rojas hagutierrezro@gmail.com
Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), Model Assisted Survey Sampling. Springer.
Gutierrez, H. A. (2009), Estrategias de muestreo: Diseno de encuestas y estimacion de parametros.
Editorial Universidad Santo Tomas.
############ ## Example 1 ############ # Uses the Lucy data to draw a simple random sample without replacement data(Lucy) attach(Lucy) N <- dim(Lucy)[1] n <- 400 sam <- sample(N,n) # The vector of inclusion probabilities for each unit in the sample pik <- rep(n/N,n) # The information about the units in the sample is stored in an object called data data <- Lucy[sam,] attach(data) names(data) # The variables of interest are: Income, Employees and Taxes # This information is stored in a data frame called estima estima <- data.frame(Income, Employees, Taxes) HT(estima, pik) ############ ## Example 2 ############ # Uses the Lucy data to draw a simple random sample with replacement data(Lucy) N <- dim(Lucy)[1] m <- 400 sam <- sample(N,m,replace=TRUE) # The vector of selection probabilities of units in the sample pk <- rep(1/N,m) # Computation of the inclusion probabilities pik <- 1-(1-pk)^m # The information about the units in the sample is stored in an object called data data <- Lucy[sam,] attach(data) names(data) # The variables of interest are: Income, Employees and Taxes # This information is stored in a data frame called estima estima <- data.frame(Income, Employees, Taxes) HT(estima, pik) ############ ## Example 3 ############ # Without replacement sampling # Vector U contains the label of a population of size N=5 U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") # Vector y1 and y2 are the values of the variables of interest y1<-c(32, 34, 46, 89, 35) y2<-c(1,1,1,0,0) y3<-cbind(y1,y2) # The population size is N=5 N <- length(U) # The sample size is n=2 n <- 2 # The sample membership matrix for fixed size without replacement sampling designs Ind <- Ik(N,n) # p is the probability of selection of every possible sample p <- c(0.13, 0.2, 0.15, 0.1, 0.15, 0.04, 0.02, 0.06, 0.07, 0.08) # Computation of the inclusion probabilities inclusion <- Pik(p, Ind) # Selection of a random sample sam <- sample(5,2) # The selected sample U[sam] # The inclusion probabilities for these two units inclusion[sam] # The values of the variables of interest for the units in the sample y1[sam] y2[sam] y3[sam,] # The Horvitz-Thompson estimator HT(y1[sam],inclusion[sam]) HT(y2[sam],inclusion[sam]) HT(y3[sam,],inclusion[sam]) ############ ## Example 4 ############ # Following Example 3... With replacement sampling # The population size is N=5 N <- length(U) # The sample size is m=2 m <- 2 # pk is the probability of selection of every single unit pk <- c(0.9, 0.025, 0.025, 0.025, 0.025) # Computation of the inclusion probabilities pik <- 1-(1-pk)^m # Selection of a random sample with replacement sam <- sample(5,2, replace=TRUE, prob=pk) # The selected sample U[sam] # The inclusion probabilities for these two units inclusion[sam] # The values of the variables of interest for the units in the sample y1[sam] y2[sam] y3[sam,] # The Horvitz-Thompson estimator HT(y1[sam],inclusion[sam]) HT(y2[sam],inclusion[sam]) HT(y3[sam,],inclusion[sam]) #################################################################### ## Example 5 HT is unbiased for without replacement sampling designs ## Fixed sample size #################################################################### # Vector U contains the label of a population of size N=5 U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") # Vector y1 and y2 are the values of the variables of interest y<-c(32, 34, 46, 89, 35) # The population size is N=5 N <- length(U) # The sample size is n=2 n <- 2 # The sample membership matrix for fixed size without replacement sampling designs Ind <- Ik(N,n) Ind # p is the probability of selection of every possible sample p <- c(0.13, 0.2, 0.15, 0.1, 0.15, 0.04, 0.02, 0.06, 0.07, 0.08) sum(p) # Computation of the inclusion probabilities inclusion <- Pik(p, Ind) inclusion sum(inclusion) # The support with the values of the elements Qy <-Support(N,n,ID=y) Qy # The HT estimates for every single sample in the support HT1<- HT(y[Ind[1,]==1], inclusion[Ind[1,]==1]) HT2<- HT(y[Ind[2,]==1], inclusion[Ind[2,]==1]) HT3<- HT(y[Ind[3,]==1], inclusion[Ind[3,]==1]) HT4<- HT(y[Ind[4,]==1], inclusion[Ind[4,]==1]) HT5<- HT(y[Ind[5,]==1], inclusion[Ind[5,]==1]) HT6<- HT(y[Ind[6,]==1], inclusion[Ind[6,]==1]) HT7<- HT(y[Ind[7,]==1], inclusion[Ind[7,]==1]) HT8<- HT(y[Ind[8,]==1], inclusion[Ind[8,]==1]) HT9<- HT(y[Ind[9,]==1], inclusion[Ind[9,]==1]) HT10<- HT(y[Ind[10,]==1], inclusion[Ind[10,]==1]) # The HT estimates arranged in a vector Est <- c(HT1, HT2, HT3, HT4, HT5, HT6, HT7, HT8, HT9, HT10) Est # The HT is actually desgn-unbiased data.frame(Ind, Est, p) sum(Est*p) sum(y) #################################################################### ## Example 6 HT is unbiased for without replacement sampling designs ## Random sample size #################################################################### # Vector U contains the label of a population of size N=5 U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") # Vector y1 and y2 are the values of the variables of interest y<-c(32, 34, 46, 89, 35) # The population size is N=5 N <- length(U) # The sample membership matrix for random size without replacement sampling designs Ind <- IkRS(N) Ind # p is the probability of selection of every possible sample p <- c(0.59049, 0.06561, 0.06561, 0.06561, 0.06561, 0.06561, 0.00729, 0.00729, 0.00729, 0.00729, 0.00729, 0.00729, 0.00729, 0.00729, 0.00729, 0.00729, 0.00081, 0.00081, 0.00081, 0.00081, 0.00081, 0.00081, 0.00081, 0.00081, 0.00081, 0.00081, 0.00009, 0.00009, 0.00009, 0.00009, 0.00009, 0.00001) sum(p) # Computation of the inclusion probabilities inclusion <- Pik(p, Ind) inclusion sum(inclusion) # The support with the values of the elements Qy <-SupportRS(N, ID=y) Qy # The HT estimates for every single sample in the support HT1<- HT(y[Ind[1,]==1], inclusion[Ind[1,]==1]) HT2<- HT(y[Ind[2,]==1], inclusion[Ind[2,]==1]) HT3<- HT(y[Ind[3,]==1], inclusion[Ind[3,]==1]) HT4<- HT(y[Ind[4,]==1], inclusion[Ind[4,]==1]) HT5<- HT(y[Ind[5,]==1], inclusion[Ind[5,]==1]) HT6<- HT(y[Ind[6,]==1], inclusion[Ind[6,]==1]) HT7<- HT(y[Ind[7,]==1], inclusion[Ind[7,]==1]) HT8<- HT(y[Ind[8,]==1], inclusion[Ind[8,]==1]) HT9<- HT(y[Ind[9,]==1], inclusion[Ind[9,]==1]) HT10<- HT(y[Ind[10,]==1], inclusion[Ind[10,]==1]) HT11<- HT(y[Ind[11,]==1], inclusion[Ind[11,]==1]) HT12<- HT(y[Ind[12,]==1], inclusion[Ind[12,]==1]) HT13<- HT(y[Ind[13,]==1], inclusion[Ind[13,]==1]) HT14<- HT(y[Ind[14,]==1], inclusion[Ind[14,]==1]) HT15<- HT(y[Ind[15,]==1], inclusion[Ind[15,]==1]) HT16<- HT(y[Ind[16,]==1], inclusion[Ind[16,]==1]) HT17<- HT(y[Ind[17,]==1], inclusion[Ind[17,]==1]) HT18<- HT(y[Ind[18,]==1], inclusion[Ind[18,]==1]) HT19<- HT(y[Ind[19,]==1], inclusion[Ind[19,]==1]) HT20<- HT(y[Ind[20,]==1], inclusion[Ind[20,]==1]) HT21<- HT(y[Ind[21,]==1], inclusion[Ind[21,]==1]) HT22<- HT(y[Ind[22,]==1], inclusion[Ind[22,]==1]) HT23<- HT(y[Ind[23,]==1], inclusion[Ind[23,]==1]) HT24<- HT(y[Ind[24,]==1], inclusion[Ind[24,]==1]) HT25<- HT(y[Ind[25,]==1], inclusion[Ind[25,]==1]) HT26<- HT(y[Ind[26,]==1], inclusion[Ind[26,]==1]) HT27<- HT(y[Ind[27,]==1], inclusion[Ind[27,]==1]) HT28<- HT(y[Ind[28,]==1], inclusion[Ind[28,]==1]) HT29<- HT(y[Ind[29,]==1], inclusion[Ind[29,]==1]) HT30<- HT(y[Ind[30,]==1], inclusion[Ind[30,]==1]) HT31<- HT(y[Ind[31,]==1], inclusion[Ind[31,]==1]) HT32<- HT(y[Ind[32,]==1], inclusion[Ind[32,]==1]) # The HT estimates arranged in a vector Est <- c(HT1, HT2, HT3, HT4, HT5, HT6, HT7, HT8, HT9, HT10, HT11, HT12, HT13, HT14, HT15, HT16, HT17, HT18, HT19, HT20, HT21, HT22, HT23, HT24, HT25, HT26, HT27, HT28, HT29, HT30, HT31, HT32) Est # The HT is actually desgn-unbiased data.frame(Ind, Est, p) sum(Est*p) sum(y) ################################################################ ## Example 7 HT is unbiased for with replacement sampling designs ################################################################ # Vector U contains the label of a population of size N=5 U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") # Vector y1 and y2 are the values of the variables of interest y<-c(32, 34, 46, 89, 35) # The population size is N=5 N <- length(U) # The sample size is m=2 m <- 2 # pk is the probability of selection of every single unit pk <- c(0.35, 0.225, 0.175, 0.125, 0.125) # p is the probability of selection of every possible sample p <- p.WR(N,m,pk) p sum(p) # The sample membership matrix for random size without replacement sampling designs Ind <- IkWR(N,m) Ind # The support with the values of the elements Qy <- SupportWR(N,m, ID=y) Qy # Computation of the inclusion probabilities pik <- 1-(1-pk)^m pik # The HT estimates for every single sample in the support HT1 <- HT(y[Ind[1,]==1], pik[Ind[1,]==1]) HT2 <- HT(y[Ind[2,]==1], pik[Ind[2,]==1]) HT3 <- HT(y[Ind[3,]==1], pik[Ind[3,]==1]) HT4 <- HT(y[Ind[4,]==1], pik[Ind[4,]==1]) HT5 <- HT(y[Ind[5,]==1], pik[Ind[5,]==1]) HT6 <- HT(y[Ind[6,]==1], pik[Ind[6,]==1]) HT7 <- HT(y[Ind[7,]==1], pik[Ind[7,]==1]) HT8 <- HT(y[Ind[8,]==1], pik[Ind[8,]==1]) HT9 <- HT(y[Ind[9,]==1], pik[Ind[9,]==1]) HT10 <- HT(y[Ind[10,]==1], pik[Ind[10,]==1]) HT11 <- HT(y[Ind[11,]==1], pik[Ind[11,]==1]) HT12 <- HT(y[Ind[12,]==1], pik[Ind[12,]==1]) HT13 <- HT(y[Ind[13,]==1], pik[Ind[13,]==1]) HT14 <- HT(y[Ind[14,]==1], pik[Ind[14,]==1]) HT15 <- HT(y[Ind[15,]==1], pik[Ind[15,]==1]) # The HT estimates arranged in a vector Est <- c(HT1, HT2, HT3, HT4, HT5, HT6, HT7, HT8, HT9, HT10, HT11, HT12, HT13, HT14, HT15) Est # The HT is actually desgn-unbiased data.frame(Ind, Est, p) sum(Est*p) sum(y)
Please choose more modern alternatives, such as Google Chrome or Mozilla Firefox.