获取拟合帕累托分布和正态分布的参数

Getting Parameters of fitted pareto and normal distributions

提问人:Maike Wuerzburg 提问时间:11/12/2023 最后编辑:Maike Wuerzburg 更新时间:11/14/2023 访问量:48

问:

我想将正态分布和帕累托分布拟合到我收集的一些数据中。我将在下面插入所有三种类型的配件的代码。您会注意到它们非常相似,因为配件的概念是相同的。 我在每个代码中将数据作为向量插入。

我有三种类型的配件: (1)我的数据和正态分布 (2)我的数据和帕累托分布(其中我使用数据中的最小值作为起点,因为帕累托分布只能是正的) (3)我的数据和帕累托分布(其中我只使用绝对值,这样我就没有负数据)

我的数据几乎呈正态分布,因此很明显,帕累托分布与我的数据不匹配。但是,我仍然必须为我的数据找到最佳的“拟合”帕累托分布。

我的问题是,我最终得到的参数与我一开始使用的参数部分相同。我以为配件后参数会有所不同。一开始,我将参数设置为起点。在 (3) 中,值不同,但在 (1) 和 (2) 中,它们没有。

如果您有任何想法,为什么(1)和(2)中的参数在安装后没有改变,我将不胜感激!

这是我的 R 代码 (1): 我的数据和正态分布

rm(list = ls()) # Clear environment
cat("\014")  # Clear console

#install.packages("tidyverse")
#install.packages("optimization")
#install.packages("moments")
#install.packages("ptsuite")
#install.packages("ggpubr")

library(nloptr)
library(moments)
library(tidyverse)
library(optimization)
library(ggplot2)
library(ptsuite)
library(ggpubr)
library(readxl)

# My data and normal distribution -------------------------------------------------

simulations<-5000 #Set desired size of simulated distribution for fitting
number.bins<-21 #Set number of bins for fitting

# Fix random process for reproducible results -----------------------------

set.seed(1)

calculate.normal <- function(x) {
  para.cases <- x[1]
  para.steps <- x[2]
  para.mean <- x[3]
  para.sd <- x[4]
  
  normal.vector <- c(para.cases) #Result of the Function, e.g. Vector with 20 bins/cases
  
  for (i in 1:para.cases) {
    x<-para.mean-3.0*para.sd+((i-1)*para.steps)
    normal.vector[i] <- (1/(sqrt(2*pi*para.sd^2))*exp((-1/2)*((x-para.mean)/para.sd)^2))
        i <- i + para.steps
  } 
  
  return(normal.vector)
}

# generate data base ------------------------------------------------------
# Your set of values
data.vector <- c(0.00000000, 0.00000000, 0.00000000, 0.57543070, -0.60346822, 0.00000000, -0.75467120, -2.87598944, -0.69492436, 0.00000000,
            -0.39135436, 0.00000000, 2.08477309, -1.03078570, -0.93325051, 2.33102484, 0.00000000, -3.14602489, -4.48954942, 0.30357000,
            -2.45110053, 0.00000000, 0.00000000, 0.69492436, -0.99849437, -0.69492436, 0.00000000, 1.45017758, -3.41788558, -0.39348127,
            1.48431547, 3.86714022, 0.00000000, -0.96678506, 0.96678506, -1.23806373, 0.00000000, -2.45110053, 0.00000000, 0.00000000,
            0.21153184, 2.45110053, 4.82666087, -1.75838793, -0.05974683, -2.35663378, -0.75892501, 0.00000000, 0.00000000, -0.78696253,
            3.58676501, -1.38984873, -0.96678506, -0.69492436, 1.23806373, 0.45535501, 2.14510195, 2.45110053, 0.03016443, -0.93325051,
            0.54313936, 1.69341873, 0.75525322, -0.96678506, -2.45110053, -2.29931552, -0.06400065, 0.75892501, 0.60772203, 0.21153184,
            -1.58185067, -1.69584731, -0.69492436, -1.60439116, -0.30357000, 0.00000000, 4.56206459, 1.66170942, 0.00000000, -0.51143005,
            -1.11857006, 5.22836941, 0.00000000, -1.60439116, 0.69492436, 0.00000000, -0.15178500, 0.05974683, 0.00000000, 0.00000000,
            -1.39227730, 1.45017758)

#Create output vectors

simulation.vector<-c()
data.density<-c()

# generate frequency distributions ----------------------------------------

#generate frequency distributions. trunc und ceiling make sure that all values are in the interval (round leads to exclusion)

histogram.absolute<-hist(data.vector, breaks=seq(floor(min(data.vector)),ceiling(max(data.vector)),l=number.bins), 
                          col='lightgrey', xlab = "Score", face= "bold", 
                         ylab = "Fequency", main = "", font.lab = 2, 
                         cex.lab = 1.5, xlim = c(-5, 7), ylim = c(0, 30))
axis(side=1, at=seq(-5, 6, by=1), lwd.ticks = 1)
axis(side=2, at=seq( 0, 30, by=5), lwd.ticks = 1)

# extract case numbers for calculating percentages
#we extract the counts for each bin of the actual data
cases.data<-sum(histogram.absolute$counts)
data.density.p<-histogram.absolute$counts/cases.data

#Make a Histogram of relative frequencies
barplot(data.density.p, col='blue', xaxes = FALSE, axis.lty = 25, ylim = c(0, 0.35), 
        las = 1, space = 0, ylab = "relative Frequency", main = "", face= "bold", font.lab = 2, 
        cex.lab = 1.4, cex.axis = 0.5)
#axis(side=1, at=seq( 0, 26, by=1), lwd.ticks = 1)
#axis(side=2, lwd.ticks = 1, labels = c("hello", "0.05", "0.10", "0.15", "0.20", "0.25", "0.30", "0.35"))

view(data.density.p)
# now we have the original distribution histogram, but with relative frequencies instead of absolute numbers (counts)

#Define function
Fit.To.normal <- function(x, data.density.p) {
  para.steps <- x[1]
  para.mean <- x[2]
  para.sd <- x[3]
  
  daten <- data.density.p
  
  simulated.vector <- calculate.normal(c(20, para.steps, para.mean, para.sd)) 
  #generate a PERFECT normal distribution with given parameters (but 20 bins fixed, why?)
  simulated.vector.p <- simulated.vector / sum(simulated.vector)
  # obtain the relative frequencies of the above PERFECT Pareto distribution

  delta <- (daten - simulated.vector.p) 
  #difference between real data and simulated function (data.density.p, with fixed shape and scale) 
  delta.sq <- delta^2
  
  SRMSE <- sqrt(mean(delta.sq))
  #view(mean(data.density.p))
  #view(mean(simulated.vector.p))
  barplot(data.density.p, col='blue', xlim=c(0,23), space = 0, xaxes = FALSE, axis.lty = 25, ylim = c(0, 0.35), 
          las = 1, ylab = "relative Frequency", main = "", face= "bold", font.lab = 2, 
          cex.lab = 1.4, cex.axis = 0.5)
  #axis(side=1)
  barplot(simulated.vector.p, col = 'red', add=T, xaxes = FALSE, axis.lty = 25, ylim = c(0, 0.35), 
          las = 1, space = 0, ylab = "relative Frequency", main = "", face= "bold", font.lab = 2, 
          cex.lab = 1.4, cex.axis = 0.5) #red is simulated distribution, blue is real data
  barplot(data.density.p, col='blue', xlim=c(0,23), add=T, xaxes = FALSE, axis.lty = 25, ylim = c(0, 0.35), 
          las = 1, space = 0, ylab = "relative Frequency", main = "", face= "bold", font.lab = 2, 
          cex.lab = 1.4, cex.axis = 0.5)
  barplot(simulated.vector.p, col=rgb(1,0,0,0.5), add=T, xaxes = FALSE, axis.lty = 25, ylim = c(0, 0.35), 
          las = 1, space = 0, ylab = "relative Frequency", main = "", face= "bold", font.lab = 2, 
          cex.lab = 1.4, cex.axis = 0.5)
  #axis(side = 1, at = seq(0,23,1)#seq_along(data.density.p)
        #, tick = TRUE)
  return(SRMSE)
}

view(data.density.p)
#Fitting

passung <- optim(c(1, 1, 1),
              fn = Fit.To.normal,
              data.density.p = data.density.p,
              method = "L-BFGS-B",
              lower = c(1, 0.1, 0.1),
              upper = c(10, 10, 10))
print(passung) >

这是我的 R 代码 (2):我的数据和帕累托分布

rm(list = ls()) # Clear environment
cat("\014")  # Clear console

#install.packages("tidyverse")
#install.packages("optimization")
#install.packages("moments")
#install.packages("ptsuite")
#install.packages("ggpubr")
#install.packages("patternplot")

library(nloptr)
library(moments)
library(tidyverse)
library(optimization)
library(ggplot2)
library(patternplot)
library(ptsuite)
library(ggpubr)
library(readxl)


# my data and pareto distribution -------------------------------------------------------------
data.vector <- c(0.00000000, 0.00000000, 0.00000000, 0.57543070, -0.60346822, 0.00000000, -0.75467120, -2.87598944, -0.69492436, 0.00000000,
                 -0.39135436, 0.00000000, 2.08477309, -1.03078570, -0.93325051, 2.33102484, 0.00000000, -3.14602489, -4.48954942, 0.30357000,
                 -2.45110053, 0.00000000, 0.00000000, 0.69492436, -0.99849437, -0.69492436, 0.00000000, 1.45017758, -3.41788558, -0.39348127,
                 1.48431547, 3.86714022, 0.00000000, -0.96678506, 0.96678506, -1.23806373, 0.00000000, -2.45110053, 0.00000000, 0.00000000,
                 0.21153184, 2.45110053, 4.82666087, -1.75838793, -0.05974683, -2.35663378, -0.75892501, 0.00000000, 0.00000000, -0.78696253,
                 3.58676501, -1.38984873, -0.96678506, -0.69492436, 1.23806373, 0.45535501, 2.14510195, 2.45110053, 0.03016443, -0.93325051,
                 0.54313936, 1.69341873, 0.75525322, -0.96678506, -2.45110053, -2.29931552, -0.06400065, 0.75892501, 0.60772203, 0.21153184,
                 -1.58185067, -1.69584731, -0.69492436, -1.60439116, -0.30357000, 0.00000000, 4.56206459, 1.66170942, 0.00000000, -0.51143005,
                 -1.11857006, 5.22836941, 0.00000000, -1.60439116, 0.69492436, 0.00000000, -0.15178500, 0.05974683, 0.00000000, 0.00000000,
                 -1.39227730, 1.45017758)

data.pareto <- data.vector + abs(min(data.vector)) + 1
#view(data.pareto)
pareto_test(data.pareto)

# size and bin definition ----------------------------------------------------------

simulations<-5000 #Set desired size of simulated distribution for fitting 
number.bins<-21 #Set number of bins for fitting

# Fix random process for reproducible results -----------------------------
#Making a "perfect" Pareto-distribution

#para.cases: how man cases
#para.steps: how fine the distribution should be
#para.shape: shape
#para.scale: xmin
#need %, not absolute frequency
#Function is protected from outer vectors

set.seed(1)

calculate.pareto <- function(x) {
  para.cases <- x[1]
  para.steps <- x[2]
  para.shape <- x[3]
  para.scale <- x[4]
  
  pareto.vector <- c(para.cases) #Vector with 20 bins
  
  for (i in 1:para.cases) {
    x<-para.scale+((i-1)*para.steps)
    pareto.vector[i] <- (para.shape * para.scale^para.shape) / (x^(para.shape+1)) #Calculating pareto value and saving it in the second column of the matrix
        i <- i + para.steps
  }
  
  return(pareto.vector)
  #view(calculate.pareto(c(20,20,1,10))) # gibt eine Spalte mit 20 Einträgen, die perfekt Pareto verteilt sind
}

# generate data base  --------
hist(data.vector)

# standardize data vector such that the smallest value is 0 ---------------

min.data<-min(data.vector)
max.data<-max(data.vector)

data.vector.adjusted<-data.vector-min.data
hist(data.vector.adjusted)
#haben eine Pareto-Vtlg, die bei 0 anfängt und simuliert ist (nicht perfekt)

#Create output vectors ????

simulation.vector<-c()
data.density<-c()

# generate frequency distributions ----------------------------------------

#generate frequency distributions. trunc und ceiling make sure that all values are in the interval (round leads to exclusion)

histogram.info.data<-hist(data.vector.adjusted, breaks=seq(trunc(min(data.vector.adjusted)),ceiling(max(data.vector.adjusted)),l=number.bins), 
                          col='green')

# extract case numbers for calculating percentages
cases.data<-sum(histogram.info.data$counts)

#we extract the counts for each bin of the actual data
data.density.p<-histogram.info.data$counts/cases.data
barplot(data.density.p, col='blue')
# now we have the original distribution histogram, but with relative frequencies instead of absolute numbers (counts)
#data.density is the relative distribution of a unreal pareto distribution


# Define Fitting Function -----------------------------------------------------------------

Fit.To.Pareto <- function(x, data.density.p) {
  para.steps <- x[1] 
  para.shape <- x[2] 
  para.scale <- x[3]
  
  daten <- data.density.p
  

# generate a PERFECT pareto distribution with given parameters ------------

  simulated.vector <- calculate.pareto(c(20, para.steps, para.shape, para.scale)) 
  #generate a PERFECT pareto distribution with given parameters
  simulated.vector.p <- simulated.vector / sum(simulated.vector)
  # obtain the relative frequencies of the above PERFECT Pareto distribution


# difference of simulated and real data ------------------------------------

  delta <- (daten - simulated.vector.p) 
  #difference between real data (data.density.p, with fixed shape and scale) 
  #and simulated data with the fixed parameters
  delta.sq <- delta^2
  
  SRMSE <- sqrt(mean(delta.sq))
  barplot(data.density.p, col='blue')
  barplot(simulated.vector.p, col = 'red', add=T)
  barplot(data.density.p, col='blue', add=T)
  barplot(simulated.vector.p, col=rgb(1,0,0,0.5), add=T)
  #axis(side = 1, at = seq_along(data.density.p) - 0.5, tick = TRUE)
  return(SRMSE)
}



# Optimization Fitting -----------------------------------------------------

passung <- optim(c(1, 1, 1),
              fn = Fit.To.Pareto,
              data.density.p = data.density.p,
              method = "L-BFGS-B",
              lower = c(1, 0.01, 0.1),
              upper = c(20, 10, 40)) #needs to be adjusted maybe?
print(passung)

这是我的 R 代码 (3):绝对值的帕累托分布


library(moments)
library(tidyverse)
library(optimization)
library(ggplot2)
library(ptsuite)
library(ggpubr)
library(readxl)
library(plotly)
library(psych)
library(ggdendro)
library(dplyr)
library(Metrics)
library(jmv)
library(sqldf)

data.vector <- c(0.00000000, 0.00000000, 0.00000000, 0.57543070, -0.60346822, 0.00000000, -0.75467120, -2.87598944, -0.69492436, 0.00000000,
                 -0.39135436, 0.00000000, 2.08477309, -1.03078570, -0.93325051, 2.33102484, 0.00000000, -3.14602489, -4.48954942, 0.30357000,
                 -2.45110053, 0.00000000, 0.00000000, 0.69492436, -0.99849437, -0.69492436, 0.00000000, 1.45017758, -3.41788558, -0.39348127,
                 1.48431547, 3.86714022, 0.00000000, -0.96678506, 0.96678506, -1.23806373, 0.00000000, -2.45110053, 0.00000000, 0.00000000,
                 0.21153184, 2.45110053, 4.82666087, -1.75838793, -0.05974683, -2.35663378, -0.75892501, 0.00000000, 0.00000000, -0.78696253,
                 3.58676501, -1.38984873, -0.96678506, -0.69492436, 1.23806373, 0.45535501, 2.14510195, 2.45110053, 0.03016443, -0.93325051,
                 0.54313936, 1.69341873, 0.75525322, -0.96678506, -2.45110053, -2.29931552, -0.06400065, 0.75892501, 0.60772203, 0.21153184,
                 -1.58185067, -1.69584731, -0.69492436, -1.60439116, -0.30357000, 0.00000000, 4.56206459, 1.66170942, 0.00000000, -0.51143005,
                 -1.11857006, 5.22836941, 0.00000000, -1.60439116, 0.69492436, 0.00000000, -0.15178500, 0.05974683, 0.00000000, 0.00000000,
                 -1.39227730, 1.45017758)
data.vector.abs <- abs(data.vector)

# Pareto-Test -------------------------------------------------------------


data.pareto.mirror <- data.vector.abs + 1
#view(data.pareto.spiegel)
pareto_test(data.pareto.mirror)

hist(data.vector.abs, breaks = 20, xlab = "score", face= "bold", 
     ylab = "frequency", main = "", font.lab = 2, 
     cex.lab = 1.5, col = "grey", xlim = c(0, 6), ylim = c(0, 30))
axis(side=1, at=seq(0, 6, by=1), lwd.ticks = 1)
axis(side=2, at=seq( 0, 30, by=5), lwd.ticks = 1)

# Builing a distribution ------------------------------------------------------------
#from Pareto-Fitting
simulations<-5000 #Set desired size of simulated distribution for fitting
number.bins<-21 #Set number of bins for fitting

# Fix random process for reproducible results -----------------------------
#Making a "perfect" Pareto-distribution

#para.cases: how man cases
#para.steps: how fine the distribution should be
#para.shape: shape
#para.scale: xmin
#need %, not absolute frequency
#Function is protected from outer vectors

set.seed(1)

calculate.pareto <- function(x) {
  para.cases <- x[1]
  para.steps <- x[2]
  para.shape <- x[3]
  para.scale <- x[4]
  
  pareto.vector <- c(para.cases) #Vector with 20 bins
  
  for (i in 1:para.cases) {
    x<-para.scale+((i-1)*para.steps)
    pareto.vector[i] <- (para.shape * para.scale^para.shape) / (x^(para.shape+1)) # Calculationg pareto value und save it in the second coloumn of the matrix
    i <- i + para.steps
  } 
  
  return(pareto.vector)
}

# generate data base  --------
data.vector<- data.vector.abs

# standardize data vector such that the smallest value is 0 ---------------

min.data<-min(data.vector)
max.data<-max(data.vector)

data.vector.adjusted<-data.vector-min.data
hist(data.vector.adjusted)

simulation.vector<-c()
data.density<-c()

# generate frequency distributions ----------------------------------------

#generate frequency distributions. trunc und ceiling make sure that all values are in the interval (round leads to exclusion)

histogram.info.data<-hist(data.vector.adjusted, breaks=seq(trunc(min(data.vector.adjusted)),ceiling(max(data.vector.adjusted)),l=number.bins), 
                          col='lightgrey', xlab = "score", face= "bold", 
                          ylab = "frequency", main = "", font.lab = 2, 
                          cex.lab = 1.5)
#view(data.vector.adjusted)
#view(histogram.info.data)

# extract case numbers for calculating percentages
cases.data<-sum(histogram.info.data$counts)

#we extract the counts for each bin of the actual data
data.density.p<-histogram.info.data$counts/cases.data
barplot(data.density.p, col='blue')
# now we have the original distribution histogram, but with relative frequencies instead of absolute numbers (counts)
#data.density is the relative distribution of a unreal pareto distribution


# Define Fitting Function  -----------------------------------------------------------------

Fit.To.Pareto <- function(x, data.density.p) {
  para.steps <- x[1]
  para.shape <- x[2]
  para.scale <- x[3]
  
  daten <- data.density.p
  
  
  # generate a PERFECT pareto distribution with given parameters ------------
  
  simulated.vector <- calculate.pareto(c(20, para.steps, para.shape, para.scale)) 
  #generate a PERFECT pareto distribution with given parameters
  simulated.vector.p <- simulated.vector / sum(simulated.vector)
  # obtain the relative frequencies of the above PERFECT Pareto distribution
  
  
  # difference of simulated and real data ------------------------------------
  
  delta <- (daten - simulated.vector.p) 
  #difference between real data (data.density.p, with fixed shape and scale) 
  #and simulated data with the fixed parameters
  delta.sq <- delta^2
  
  SRMSE <- sqrt(mean(delta.sq))
  barplot(data.density.p, col='blue')
  barplot(simulated.vector.p, col = 'red', add=T)
  barplot(data.density.p, col='blue', add=T)
  barplot(simulated.vector.p, col=rgb(1,0,0,0.5), add=T)
  return(SRMSE)
}

# Optimization Fitting -----------------------------------------------------

passung <- optim(c(1, 1, 1),
                 fn = Fit.To.Pareto,
                 data.density.p = data.density.p,
                 method = "L-BFGS-B",
                 lower = c(1, 0.1, 0.1),
                 upper = c(10, 10, 10)) #needs to be adjusted maybe?
print(passung)

R 正态分布 模型拟合 帕累托最优

评论

1赞 Limey 11/13/2023
你已经向我们提供了你的代码,这些代码可以达到你的预期,但你的数据和代码都不起作用。在这种情况下,要提供帮助并不容易......
0赞 Maike Wuerzburg 11/14/2023
感谢 Limey 的反馈!我编辑了问题,并在每个代码的开头插入了所有三个代码和数据作为向量。

答: 暂无答案