提问人:Nerd 提问时间:3/3/2023 更新时间:3/3/2023 访问量:239
在 R 中使用 censReg 的 Tobit 模型的预测函数
Predict function for Tobit model using censReg in R
问:
我想估计一个具有随机效应的 Tobit 模型,然后使用该模型进行预测。 使用 censReg 包可以进行估计,但它不提供预测函数。 我在这里找到了一个用户编写的横截面数据代码,我用它作为编写我自己的函数的起点。我还提到了 Achim Zeileis 对这个问题的回答,以及 censReg 小插曲。
我希望熟悉该软件包和一般过程的人查看我的代码并说它是否合理。
我还在下面提供了一个带有模拟数据的示例。
######################################################################
# Load packages
#####################################################################
library(censReg)
library(plm)
library(ggplot2)
library(gridExtra)
######################################################################
# Write the function
#####################################################################
predict.censReg_tobit <- function(model, data) {
# 1) Write a function for the inverse Mills Ratio
lambda <- function(x) {
lambda <- dnorm(x)/pnorm(x)
lambda
}
# 2) Calculate the predicted latent variable and store the standard deviation (sd) estimate
# The output differs depending on whether a random effects model was estimated or not
# Therefore we need to check the rownames of the output first
if(!is.element("logSigmaMu", rownames(summary(model)$estimate))){ # Was a random effects model (not) estimated?
# If logSigmaMu is not contained, model is a cross-sectional/ pooled regression
# The last estimate contains the log-sd of the error term logSigma
# In order to store the coefficients only, we therefore need to discard the last row
coefficients <- summary(model)$estimate[1:(length(summary(model)$estimate[,1])-1),1]
namesCoef <- names(coefficients)
# Which columns in the original data contain our covariates?
indices <- unlist(lapply(namesCoef[-1], function(x) which(colnames(data)==x)))
# Check if all variables are contained in the data set and predict the latent variable
if (length(indices)==(length(coefficients)-1)) { # Check if all variables are contained in the data set
# If so: store the data that shall be used for prediction in data_pred
data_pred <- cbind(rep(1, nrow(data)), do.call(cbind, lapply(indices, function(x) data[,x])))
# and multiply it with the coefficients to get predictions of the latent variable
latent_var_pred <- (as.matrix(data_pred) %*% coefficients)[,1]
# store the sd estimate
sigma <- exp(summary(model)$estimate[nrow(summary(model)$estimate),1])
} else {
# otherwise show a warning message
warning("The dataset does not contain some of the variables.")
}
# 3) Return the predicted values
# Predict the dependent variable in the testing_set with the trained model
p0 <- pnorm(latent_var_pred/sigma) # probability of a non-zero observation
ey0 <- latent_var_pred + sigma*lambda(latent_var_pred/sigma) # conditional expectation of the censored y given that it is non-zero
predicted <- p0*ey0 # unconditional expectation
predicted
} else {
# LogSigmaNu is the log-sd of the remaining disturbance (that is not attributed to the individual specific effects)
# If a random effects model was estimated, the the last two estimates contain the sd of the individual specific effects and the remaining disturbance
# In order to store the coefficients only, we therefore need to discard the last two rows
coefficients <- summary(model)$estimate[1:(length(summary(model)$estimate[,1])-2),1]
namesCoef <- names(coefficients)
# Which columns in the original data contain our covariates?
indices <- unlist(lapply(namesCoef[-1], function(x) which(colnames(data)==x)))
# Check if all variables are contained in the data set and predict the latent variable
if (length(indices)==(length(coefficients)-1)) { # Check if all variables are contained in the data set
# If yes: store the data that shall be used for prediction in data_pred
data_pred <- cbind(rep(1, nrow(data)), do.call(cbind, lapply(indices, function(x) data[,x])))
# and multiply it with the coefficients to get predictions of the latent variable
latent_var_pred <- (as.matrix(data_pred) %*% coefficients)[,1]
# store the sd estimate
# (in this case: of the remaining sd, i.e. exp(logSigmaNu))
sigma <- exp(summary(model)$estimate[nrow(summary(model)$estimate),1])
} else {
# otherwise show a warning message
warning("The dataset does not contain some of the variables.")
}
# 3) Return the predicted values
# Predict the dependent variable in the testing_set with the trained model
p0 <- pnorm(latent_var_pred/sigma) # probability of a non-zero observation
# lambda <- function(x) dnorm(x)/pnorm(x) # inverse Mills ratio
ey0 <- latent_var_pred + sigma*lambda(latent_var_pred/sigma) # conditional expectation of the censored y given that it is non-zero
predicted <- p0*ey0 # unconditional expectation
predicted
}}
#####################################################################
# Simulating test data
#####################################################################
set.seed(999)
# explanatory variable
x_t0 <- rnorm(1000, mean = 4.5, sd = 2)
x_t1 <- x_t0 + rnorm(1000, mean = 0, sd = 1)
x_t2 <- x_t0 + rnorm(1000, mean = 0, sd = 1)
x_t3 <- x_t0 + rnorm(1000, mean = 0, sd = 1)
# latent dependend variable
y_t0 <- 0.9*x_t0 + rnorm(1000, mean = 0, sd = 0.4)
y_t1 <- 0.9*x_t1 + rnorm(1000, mean = 0, sd = 0.4)
y_t2 <- 0.9*x_t2 + rnorm(1000, mean = 0, sd = 0.4)
y_t3 <- 0.9*x_t3 + rnorm(1000, mean = 0, sd = 0.4)
# combine it all in a data frame
testdata <- data.frame(id = rep(seq(1:1000),4),
time = c(rep(0,1000),rep(1,1000),
rep(2,1000),rep(3,1000)),
y = c(y_t0, y_t1, y_t2, y_t3),
x = c(x_t0, x_t1, x_t2, x_t3))
# truncation
testdata$y[testdata$y < 0] <- 0
# Split testdata into an estimation and a validation data set
testdata_estimation <- testdata[is.element(testdata$id, 1:500),]
testdata_validation <- testdata[is.element(testdata$id, 501:1000),]
# 1) pooled model
model_pooled <- censReg(y ~ x,
data = testdata_estimation, left=0, right= Inf)
exp(model_pooled$estimate[3]) # estimated sd of the disturbance term: 0.3974485
pred_pooled <- predict.censReg_tobit(model_pooled, data = testdata_validation)
# 2) random effects model
testdata_estimation_panel <- pdata.frame(testdata_estimation, index = c("id", "time"))
testdata_validation_panel <- pdata.frame(testdata_validation, index = c("id", "time"))
model_RE <- censReg(y ~ x,
data = testdata_estimation_panel, method = "BHHH",
left = 0, right= +Inf)
exp(model_RE$estimate[4]) # estimated sd of the the remaining disturbance SigmaNu: 0.3974507
pred_RE <- predict.censReg_tobit(model_RE, testdata_validation_panel)
# Plot observed vs. predicted values
## Plot predicted values against true values for different quantiles and save the output as jpg
par(mfrow = c(1,2))
a <- ggplot(data = testdata_estimation,
aes(x=x, y=y)) +
geom_point(size = 0.5) +
xlim(-5, 12) +
ylim(0, 12) +
geom_abline(intercept = 0,slope = 1, col = "red") +
annotate("text", x = 0, y=5,
label = paste("rho = ",format(round(cor(
testdata_validation$y,
pred_pooled,
method = "spearman"),
digits = 3), nsmall = 3))) +
xlab("") +
ylab("Predicted") +
ggtitle("Pooled Model")
b <- ggplot(data = testdata_estimation_panel,
aes(x=x, y=y)) +
geom_point(size = 0.5) +
xlim(-0.5, 12) +
ylim(0, 12) +
geom_abline(intercept = 0,slope = 1, col = "red") +
annotate("text", x = 0, y=5,
label = paste("rho = ", format(round(cor(
testdata_validation_panel$y,
pred_RE,
method = "spearman"),
digits = 3), nsmall = 3))) +
xlab("Observed") +
ylab("") +
ggtitle("Random Effects Model")
# compare plots
grid.arrange(a,b, ncol=2)
答: 暂无答案
评论