在 R 中使用 censReg 的 Tobit 模型的预测函数

Predict function for Tobit model using censReg in R

提问人:Nerd 提问时间:3/3/2023 更新时间:3/3/2023 访问量:239

问:

我想估计一个具有随机效应的 Tobit 模型,然后使用该模型进行预测。 使用 censReg 包可以进行估计,但它不提供预测函数。 我在这里找到了一个用户编写的横截面数据代码,我用它作为编写我自己的函数的起点。我还提到了 Achim Zeileis 对这个问题的回答,以及 censReg 小插曲

我希望熟悉该软件包和一般过程的人查看我的代码并说它是否合理。

我还在下面提供了一个带有模拟数据的示例。

######################################################################
# Load packages
#####################################################################
library(censReg)
library(plm)
library(ggplot2)
library(gridExtra)

######################################################################
# Write the function
#####################################################################

predict.censReg_tobit <- function(model, data) {
  # 1) Write a function for the inverse Mills Ratio
  lambda <- function(x) {
    lambda <- dnorm(x)/pnorm(x)
    lambda
  }
  # 2) Calculate the predicted latent variable and store the standard deviation (sd) estimate  
  # The output differs depending on whether a random effects model was estimated or not
  # Therefore we need to check the rownames of the output first
  if(!is.element("logSigmaMu", rownames(summary(model)$estimate))){ # Was a random effects model (not) estimated?
    # If logSigmaMu is not contained, model is a cross-sectional/ pooled regression
    # The last estimate contains the log-sd of the error term logSigma
    # In order to store the coefficients only, we therefore need to discard the last row 
    coefficients <- summary(model)$estimate[1:(length(summary(model)$estimate[,1])-1),1]
    namesCoef <- names(coefficients)
    # Which columns in the original data contain our covariates?
    indices <- unlist(lapply(namesCoef[-1], function(x) which(colnames(data)==x)))
    # Check if all variables are contained in the data set and predict the latent variable
    if (length(indices)==(length(coefficients)-1)) { # Check if all variables are contained in the data set
      # If so: store the data that shall be used for prediction in data_pred 
      data_pred <- cbind(rep(1, nrow(data)), do.call(cbind, lapply(indices, function(x) data[,x])))
      # and multiply it with the coefficients to get predictions of the latent variable 
      latent_var_pred <- (as.matrix(data_pred) %*% coefficients)[,1]
      # store the sd estimate 
      sigma <- exp(summary(model)$estimate[nrow(summary(model)$estimate),1]) 
        } else {
      # otherwise show a warning message
      warning("The dataset does not contain some of the variables.") 
    }
    # 3) Return the predicted values
      # Predict the dependent variable in the testing_set with the trained model
      p0 <- pnorm(latent_var_pred/sigma) # probability of a non-zero observation
      ey0 <- latent_var_pred + sigma*lambda(latent_var_pred/sigma) # conditional expectation of the censored y given that it is non-zero
      predicted <- p0*ey0 # unconditional expectation
      predicted
        } else {
          # LogSigmaNu is the log-sd of the remaining disturbance (that is not attributed to the individual specific effects)
          # If a random effects model was estimated, the the last two estimates contain the sd of the individual specific effects and the remaining disturbance
          # In order to store the coefficients only, we therefore need to discard the last two rows 
          coefficients <- summary(model)$estimate[1:(length(summary(model)$estimate[,1])-2),1]
          namesCoef <- names(coefficients)
          # Which columns in the original data contain our covariates?
          indices <- unlist(lapply(namesCoef[-1], function(x) which(colnames(data)==x)))
          # Check if all variables are contained in the data set and predict the latent variable
          if (length(indices)==(length(coefficients)-1)) { # Check if all variables are contained in the data set
            # If yes: store the data that shall be used for prediction in data_pred 
            data_pred <- cbind(rep(1, nrow(data)), do.call(cbind, lapply(indices, function(x) data[,x])))
            # and multiply it with the coefficients to get predictions of the latent variable 
            latent_var_pred <- (as.matrix(data_pred) %*% coefficients)[,1]
            # store the sd estimate 
            # (in this case: of the remaining sd, i.e. exp(logSigmaNu)) 
            sigma <- exp(summary(model)$estimate[nrow(summary(model)$estimate),1]) 
        } else {
          # otherwise show a warning message
          warning("The dataset does not contain some of the variables.") 
        }
        # 3) Return the predicted values 
          # Predict the dependent variable in the testing_set with the trained model
          p0 <- pnorm(latent_var_pred/sigma) # probability of a non-zero observation
          # lambda <- function(x) dnorm(x)/pnorm(x) # inverse Mills ratio
          ey0 <- latent_var_pred + sigma*lambda(latent_var_pred/sigma) # conditional expectation of the censored y given that it is non-zero
          predicted <- p0*ey0 # unconditional expectation
          predicted
          }}


#####################################################################
# Simulating test data
#####################################################################
set.seed(999)

# explanatory variable
x_t0 <- rnorm(1000, mean = 4.5, sd = 2)
x_t1 <- x_t0 + rnorm(1000, mean = 0, sd = 1)
x_t2 <- x_t0 + rnorm(1000, mean = 0, sd = 1)
x_t3 <- x_t0 + rnorm(1000, mean = 0, sd = 1)

# latent dependend variable
y_t0 <- 0.9*x_t0 + rnorm(1000, mean = 0, sd = 0.4)
y_t1 <- 0.9*x_t1 + rnorm(1000, mean = 0, sd = 0.4)
y_t2 <- 0.9*x_t2 + rnorm(1000, mean = 0, sd = 0.4)
y_t3 <- 0.9*x_t3 + rnorm(1000, mean = 0, sd = 0.4)

# combine it all in a data frame
testdata <- data.frame(id = rep(seq(1:1000),4),
                       time = c(rep(0,1000),rep(1,1000),
                                rep(2,1000),rep(3,1000)),
                       y = c(y_t0, y_t1, y_t2, y_t3),
                       x = c(x_t0, x_t1, x_t2, x_t3))

# truncation
testdata$y[testdata$y < 0] <- 0

# Split testdata into an estimation and a validation data set
testdata_estimation <- testdata[is.element(testdata$id, 1:500),]
testdata_validation <- testdata[is.element(testdata$id, 501:1000),]

# 1) pooled model
model_pooled <- censReg(y ~ x,
                 data = testdata_estimation, left=0, right= Inf)

exp(model_pooled$estimate[3]) # estimated sd of the disturbance term: 0.3974485
pred_pooled <- predict.censReg_tobit(model_pooled, data = testdata_validation)


# 2) random effects model
testdata_estimation_panel <- pdata.frame(testdata_estimation, index = c("id", "time"))
testdata_validation_panel <- pdata.frame(testdata_validation, index = c("id", "time"))

model_RE <- censReg(y ~ x,
                    data = testdata_estimation_panel, method = "BHHH",
                    left = 0, right= +Inf)

exp(model_RE$estimate[4]) # estimated sd of the the remaining disturbance SigmaNu: 0.3974507
pred_RE <- predict.censReg_tobit(model_RE, testdata_validation_panel)

# Plot observed vs. predicted values
## Plot predicted values against true values for different quantiles and save the output as jpg
par(mfrow = c(1,2))
a <- ggplot(data = testdata_estimation,
            aes(x=x, y=y)) +
  geom_point(size = 0.5) +
  xlim(-5, 12) +
  ylim(0, 12) +
  geom_abline(intercept = 0,slope = 1, col = "red") +
  annotate("text", x = 0, y=5,
           label = paste("rho = ",format(round(cor(
             testdata_validation$y,
             pred_pooled,
             method = "spearman"),
             digits = 3), nsmall = 3))) +
  xlab("") +
  ylab("Predicted") +
  ggtitle("Pooled Model")

b <- ggplot(data = testdata_estimation_panel,
            aes(x=x, y=y)) +
  geom_point(size = 0.5) +
  xlim(-0.5, 12) +
  ylim(0, 12) +
  geom_abline(intercept = 0,slope = 1, col = "red") +
  annotate("text", x = 0, y=5,
           label = paste("rho = ", format(round(cor(
             testdata_validation_panel$y,
             pred_RE,
             method = "spearman"),
             digits = 3), nsmall = 3))) +
  xlab("Observed") +
  ylab("") +
  ggtitle("Random Effects Model")


# compare plots
grid.arrange(a,b, ncol=2)


R 回归 面板 - 数据 截断随机 效应

评论


答: 暂无答案