提问人:Maike Wuerzburg 提问时间:11/12/2023 最后编辑:Maike Wuerzburg 更新时间:11/14/2023 访问量:48
获取拟合帕累托分布和正态分布的参数
Getting Parameters of fitted pareto and normal distributions
问:
我想将正态分布和帕累托分布拟合到我收集的一些数据中。我将在下面插入所有三种类型的配件的代码。您会注意到它们非常相似,因为配件的概念是相同的。 我在每个代码中将数据作为向量插入。
我有三种类型的配件: (1)我的数据和正态分布 (2)我的数据和帕累托分布(其中我使用数据中的最小值作为起点,因为帕累托分布只能是正的) (3)我的数据和帕累托分布(其中我只使用绝对值,这样我就没有负数据)
我的数据几乎呈正态分布,因此很明显,帕累托分布与我的数据不匹配。但是,我仍然必须为我的数据找到最佳的“拟合”帕累托分布。
我的问题是,我最终得到的参数与我一开始使用的参数部分相同。我以为配件后参数会有所不同。一开始,我将参数设置为起点。在 (3) 中,值不同,但在 (1) 和 (2) 中,它们没有。
如果您有任何想法,为什么(1)和(2)中的参数在安装后没有改变,我将不胜感激!
这是我的 R 代码 (1): 我的数据和正态分布
rm(list = ls()) # Clear environment
cat("\014") # Clear console
#install.packages("tidyverse")
#install.packages("optimization")
#install.packages("moments")
#install.packages("ptsuite")
#install.packages("ggpubr")
library(nloptr)
library(moments)
library(tidyverse)
library(optimization)
library(ggplot2)
library(ptsuite)
library(ggpubr)
library(readxl)
# My data and normal distribution -------------------------------------------------
simulations<-5000 #Set desired size of simulated distribution for fitting
number.bins<-21 #Set number of bins for fitting
# Fix random process for reproducible results -----------------------------
set.seed(1)
calculate.normal <- function(x) {
para.cases <- x[1]
para.steps <- x[2]
para.mean <- x[3]
para.sd <- x[4]
normal.vector <- c(para.cases) #Result of the Function, e.g. Vector with 20 bins/cases
for (i in 1:para.cases) {
x<-para.mean-3.0*para.sd+((i-1)*para.steps)
normal.vector[i] <- (1/(sqrt(2*pi*para.sd^2))*exp((-1/2)*((x-para.mean)/para.sd)^2))
i <- i + para.steps
}
return(normal.vector)
}
# generate data base ------------------------------------------------------
# Your set of values
data.vector <- c(0.00000000, 0.00000000, 0.00000000, 0.57543070, -0.60346822, 0.00000000, -0.75467120, -2.87598944, -0.69492436, 0.00000000,
-0.39135436, 0.00000000, 2.08477309, -1.03078570, -0.93325051, 2.33102484, 0.00000000, -3.14602489, -4.48954942, 0.30357000,
-2.45110053, 0.00000000, 0.00000000, 0.69492436, -0.99849437, -0.69492436, 0.00000000, 1.45017758, -3.41788558, -0.39348127,
1.48431547, 3.86714022, 0.00000000, -0.96678506, 0.96678506, -1.23806373, 0.00000000, -2.45110053, 0.00000000, 0.00000000,
0.21153184, 2.45110053, 4.82666087, -1.75838793, -0.05974683, -2.35663378, -0.75892501, 0.00000000, 0.00000000, -0.78696253,
3.58676501, -1.38984873, -0.96678506, -0.69492436, 1.23806373, 0.45535501, 2.14510195, 2.45110053, 0.03016443, -0.93325051,
0.54313936, 1.69341873, 0.75525322, -0.96678506, -2.45110053, -2.29931552, -0.06400065, 0.75892501, 0.60772203, 0.21153184,
-1.58185067, -1.69584731, -0.69492436, -1.60439116, -0.30357000, 0.00000000, 4.56206459, 1.66170942, 0.00000000, -0.51143005,
-1.11857006, 5.22836941, 0.00000000, -1.60439116, 0.69492436, 0.00000000, -0.15178500, 0.05974683, 0.00000000, 0.00000000,
-1.39227730, 1.45017758)
#Create output vectors
simulation.vector<-c()
data.density<-c()
# generate frequency distributions ----------------------------------------
#generate frequency distributions. trunc und ceiling make sure that all values are in the interval (round leads to exclusion)
histogram.absolute<-hist(data.vector, breaks=seq(floor(min(data.vector)),ceiling(max(data.vector)),l=number.bins),
col='lightgrey', xlab = "Score", face= "bold",
ylab = "Fequency", main = "", font.lab = 2,
cex.lab = 1.5, xlim = c(-5, 7), ylim = c(0, 30))
axis(side=1, at=seq(-5, 6, by=1), lwd.ticks = 1)
axis(side=2, at=seq( 0, 30, by=5), lwd.ticks = 1)
# extract case numbers for calculating percentages
#we extract the counts for each bin of the actual data
cases.data<-sum(histogram.absolute$counts)
data.density.p<-histogram.absolute$counts/cases.data
#Make a Histogram of relative frequencies
barplot(data.density.p, col='blue', xaxes = FALSE, axis.lty = 25, ylim = c(0, 0.35),
las = 1, space = 0, ylab = "relative Frequency", main = "", face= "bold", font.lab = 2,
cex.lab = 1.4, cex.axis = 0.5)
#axis(side=1, at=seq( 0, 26, by=1), lwd.ticks = 1)
#axis(side=2, lwd.ticks = 1, labels = c("hello", "0.05", "0.10", "0.15", "0.20", "0.25", "0.30", "0.35"))
view(data.density.p)
# now we have the original distribution histogram, but with relative frequencies instead of absolute numbers (counts)
#Define function
Fit.To.normal <- function(x, data.density.p) {
para.steps <- x[1]
para.mean <- x[2]
para.sd <- x[3]
daten <- data.density.p
simulated.vector <- calculate.normal(c(20, para.steps, para.mean, para.sd))
#generate a PERFECT normal distribution with given parameters (but 20 bins fixed, why?)
simulated.vector.p <- simulated.vector / sum(simulated.vector)
# obtain the relative frequencies of the above PERFECT Pareto distribution
delta <- (daten - simulated.vector.p)
#difference between real data and simulated function (data.density.p, with fixed shape and scale)
delta.sq <- delta^2
SRMSE <- sqrt(mean(delta.sq))
#view(mean(data.density.p))
#view(mean(simulated.vector.p))
barplot(data.density.p, col='blue', xlim=c(0,23), space = 0, xaxes = FALSE, axis.lty = 25, ylim = c(0, 0.35),
las = 1, ylab = "relative Frequency", main = "", face= "bold", font.lab = 2,
cex.lab = 1.4, cex.axis = 0.5)
#axis(side=1)
barplot(simulated.vector.p, col = 'red', add=T, xaxes = FALSE, axis.lty = 25, ylim = c(0, 0.35),
las = 1, space = 0, ylab = "relative Frequency", main = "", face= "bold", font.lab = 2,
cex.lab = 1.4, cex.axis = 0.5) #red is simulated distribution, blue is real data
barplot(data.density.p, col='blue', xlim=c(0,23), add=T, xaxes = FALSE, axis.lty = 25, ylim = c(0, 0.35),
las = 1, space = 0, ylab = "relative Frequency", main = "", face= "bold", font.lab = 2,
cex.lab = 1.4, cex.axis = 0.5)
barplot(simulated.vector.p, col=rgb(1,0,0,0.5), add=T, xaxes = FALSE, axis.lty = 25, ylim = c(0, 0.35),
las = 1, space = 0, ylab = "relative Frequency", main = "", face= "bold", font.lab = 2,
cex.lab = 1.4, cex.axis = 0.5)
#axis(side = 1, at = seq(0,23,1)#seq_along(data.density.p)
#, tick = TRUE)
return(SRMSE)
}
view(data.density.p)
#Fitting
passung <- optim(c(1, 1, 1),
fn = Fit.To.normal,
data.density.p = data.density.p,
method = "L-BFGS-B",
lower = c(1, 0.1, 0.1),
upper = c(10, 10, 10))
print(passung) >
这是我的 R 代码 (2):我的数据和帕累托分布
rm(list = ls()) # Clear environment
cat("\014") # Clear console
#install.packages("tidyverse")
#install.packages("optimization")
#install.packages("moments")
#install.packages("ptsuite")
#install.packages("ggpubr")
#install.packages("patternplot")
library(nloptr)
library(moments)
library(tidyverse)
library(optimization)
library(ggplot2)
library(patternplot)
library(ptsuite)
library(ggpubr)
library(readxl)
# my data and pareto distribution -------------------------------------------------------------
data.vector <- c(0.00000000, 0.00000000, 0.00000000, 0.57543070, -0.60346822, 0.00000000, -0.75467120, -2.87598944, -0.69492436, 0.00000000,
-0.39135436, 0.00000000, 2.08477309, -1.03078570, -0.93325051, 2.33102484, 0.00000000, -3.14602489, -4.48954942, 0.30357000,
-2.45110053, 0.00000000, 0.00000000, 0.69492436, -0.99849437, -0.69492436, 0.00000000, 1.45017758, -3.41788558, -0.39348127,
1.48431547, 3.86714022, 0.00000000, -0.96678506, 0.96678506, -1.23806373, 0.00000000, -2.45110053, 0.00000000, 0.00000000,
0.21153184, 2.45110053, 4.82666087, -1.75838793, -0.05974683, -2.35663378, -0.75892501, 0.00000000, 0.00000000, -0.78696253,
3.58676501, -1.38984873, -0.96678506, -0.69492436, 1.23806373, 0.45535501, 2.14510195, 2.45110053, 0.03016443, -0.93325051,
0.54313936, 1.69341873, 0.75525322, -0.96678506, -2.45110053, -2.29931552, -0.06400065, 0.75892501, 0.60772203, 0.21153184,
-1.58185067, -1.69584731, -0.69492436, -1.60439116, -0.30357000, 0.00000000, 4.56206459, 1.66170942, 0.00000000, -0.51143005,
-1.11857006, 5.22836941, 0.00000000, -1.60439116, 0.69492436, 0.00000000, -0.15178500, 0.05974683, 0.00000000, 0.00000000,
-1.39227730, 1.45017758)
data.pareto <- data.vector + abs(min(data.vector)) + 1
#view(data.pareto)
pareto_test(data.pareto)
# size and bin definition ----------------------------------------------------------
simulations<-5000 #Set desired size of simulated distribution for fitting
number.bins<-21 #Set number of bins for fitting
# Fix random process for reproducible results -----------------------------
#Making a "perfect" Pareto-distribution
#para.cases: how man cases
#para.steps: how fine the distribution should be
#para.shape: shape
#para.scale: xmin
#need %, not absolute frequency
#Function is protected from outer vectors
set.seed(1)
calculate.pareto <- function(x) {
para.cases <- x[1]
para.steps <- x[2]
para.shape <- x[3]
para.scale <- x[4]
pareto.vector <- c(para.cases) #Vector with 20 bins
for (i in 1:para.cases) {
x<-para.scale+((i-1)*para.steps)
pareto.vector[i] <- (para.shape * para.scale^para.shape) / (x^(para.shape+1)) #Calculating pareto value and saving it in the second column of the matrix
i <- i + para.steps
}
return(pareto.vector)
#view(calculate.pareto(c(20,20,1,10))) # gibt eine Spalte mit 20 Einträgen, die perfekt Pareto verteilt sind
}
# generate data base --------
hist(data.vector)
# standardize data vector such that the smallest value is 0 ---------------
min.data<-min(data.vector)
max.data<-max(data.vector)
data.vector.adjusted<-data.vector-min.data
hist(data.vector.adjusted)
#haben eine Pareto-Vtlg, die bei 0 anfängt und simuliert ist (nicht perfekt)
#Create output vectors ????
simulation.vector<-c()
data.density<-c()
# generate frequency distributions ----------------------------------------
#generate frequency distributions. trunc und ceiling make sure that all values are in the interval (round leads to exclusion)
histogram.info.data<-hist(data.vector.adjusted, breaks=seq(trunc(min(data.vector.adjusted)),ceiling(max(data.vector.adjusted)),l=number.bins),
col='green')
# extract case numbers for calculating percentages
cases.data<-sum(histogram.info.data$counts)
#we extract the counts for each bin of the actual data
data.density.p<-histogram.info.data$counts/cases.data
barplot(data.density.p, col='blue')
# now we have the original distribution histogram, but with relative frequencies instead of absolute numbers (counts)
#data.density is the relative distribution of a unreal pareto distribution
# Define Fitting Function -----------------------------------------------------------------
Fit.To.Pareto <- function(x, data.density.p) {
para.steps <- x[1]
para.shape <- x[2]
para.scale <- x[3]
daten <- data.density.p
# generate a PERFECT pareto distribution with given parameters ------------
simulated.vector <- calculate.pareto(c(20, para.steps, para.shape, para.scale))
#generate a PERFECT pareto distribution with given parameters
simulated.vector.p <- simulated.vector / sum(simulated.vector)
# obtain the relative frequencies of the above PERFECT Pareto distribution
# difference of simulated and real data ------------------------------------
delta <- (daten - simulated.vector.p)
#difference between real data (data.density.p, with fixed shape and scale)
#and simulated data with the fixed parameters
delta.sq <- delta^2
SRMSE <- sqrt(mean(delta.sq))
barplot(data.density.p, col='blue')
barplot(simulated.vector.p, col = 'red', add=T)
barplot(data.density.p, col='blue', add=T)
barplot(simulated.vector.p, col=rgb(1,0,0,0.5), add=T)
#axis(side = 1, at = seq_along(data.density.p) - 0.5, tick = TRUE)
return(SRMSE)
}
# Optimization Fitting -----------------------------------------------------
passung <- optim(c(1, 1, 1),
fn = Fit.To.Pareto,
data.density.p = data.density.p,
method = "L-BFGS-B",
lower = c(1, 0.01, 0.1),
upper = c(20, 10, 40)) #needs to be adjusted maybe?
print(passung)
这是我的 R 代码 (3):绝对值的帕累托分布
library(moments)
library(tidyverse)
library(optimization)
library(ggplot2)
library(ptsuite)
library(ggpubr)
library(readxl)
library(plotly)
library(psych)
library(ggdendro)
library(dplyr)
library(Metrics)
library(jmv)
library(sqldf)
data.vector <- c(0.00000000, 0.00000000, 0.00000000, 0.57543070, -0.60346822, 0.00000000, -0.75467120, -2.87598944, -0.69492436, 0.00000000,
-0.39135436, 0.00000000, 2.08477309, -1.03078570, -0.93325051, 2.33102484, 0.00000000, -3.14602489, -4.48954942, 0.30357000,
-2.45110053, 0.00000000, 0.00000000, 0.69492436, -0.99849437, -0.69492436, 0.00000000, 1.45017758, -3.41788558, -0.39348127,
1.48431547, 3.86714022, 0.00000000, -0.96678506, 0.96678506, -1.23806373, 0.00000000, -2.45110053, 0.00000000, 0.00000000,
0.21153184, 2.45110053, 4.82666087, -1.75838793, -0.05974683, -2.35663378, -0.75892501, 0.00000000, 0.00000000, -0.78696253,
3.58676501, -1.38984873, -0.96678506, -0.69492436, 1.23806373, 0.45535501, 2.14510195, 2.45110053, 0.03016443, -0.93325051,
0.54313936, 1.69341873, 0.75525322, -0.96678506, -2.45110053, -2.29931552, -0.06400065, 0.75892501, 0.60772203, 0.21153184,
-1.58185067, -1.69584731, -0.69492436, -1.60439116, -0.30357000, 0.00000000, 4.56206459, 1.66170942, 0.00000000, -0.51143005,
-1.11857006, 5.22836941, 0.00000000, -1.60439116, 0.69492436, 0.00000000, -0.15178500, 0.05974683, 0.00000000, 0.00000000,
-1.39227730, 1.45017758)
data.vector.abs <- abs(data.vector)
# Pareto-Test -------------------------------------------------------------
data.pareto.mirror <- data.vector.abs + 1
#view(data.pareto.spiegel)
pareto_test(data.pareto.mirror)
hist(data.vector.abs, breaks = 20, xlab = "score", face= "bold",
ylab = "frequency", main = "", font.lab = 2,
cex.lab = 1.5, col = "grey", xlim = c(0, 6), ylim = c(0, 30))
axis(side=1, at=seq(0, 6, by=1), lwd.ticks = 1)
axis(side=2, at=seq( 0, 30, by=5), lwd.ticks = 1)
# Builing a distribution ------------------------------------------------------------
#from Pareto-Fitting
simulations<-5000 #Set desired size of simulated distribution for fitting
number.bins<-21 #Set number of bins for fitting
# Fix random process for reproducible results -----------------------------
#Making a "perfect" Pareto-distribution
#para.cases: how man cases
#para.steps: how fine the distribution should be
#para.shape: shape
#para.scale: xmin
#need %, not absolute frequency
#Function is protected from outer vectors
set.seed(1)
calculate.pareto <- function(x) {
para.cases <- x[1]
para.steps <- x[2]
para.shape <- x[3]
para.scale <- x[4]
pareto.vector <- c(para.cases) #Vector with 20 bins
for (i in 1:para.cases) {
x<-para.scale+((i-1)*para.steps)
pareto.vector[i] <- (para.shape * para.scale^para.shape) / (x^(para.shape+1)) # Calculationg pareto value und save it in the second coloumn of the matrix
i <- i + para.steps
}
return(pareto.vector)
}
# generate data base --------
data.vector<- data.vector.abs
# standardize data vector such that the smallest value is 0 ---------------
min.data<-min(data.vector)
max.data<-max(data.vector)
data.vector.adjusted<-data.vector-min.data
hist(data.vector.adjusted)
simulation.vector<-c()
data.density<-c()
# generate frequency distributions ----------------------------------------
#generate frequency distributions. trunc und ceiling make sure that all values are in the interval (round leads to exclusion)
histogram.info.data<-hist(data.vector.adjusted, breaks=seq(trunc(min(data.vector.adjusted)),ceiling(max(data.vector.adjusted)),l=number.bins),
col='lightgrey', xlab = "score", face= "bold",
ylab = "frequency", main = "", font.lab = 2,
cex.lab = 1.5)
#view(data.vector.adjusted)
#view(histogram.info.data)
# extract case numbers for calculating percentages
cases.data<-sum(histogram.info.data$counts)
#we extract the counts for each bin of the actual data
data.density.p<-histogram.info.data$counts/cases.data
barplot(data.density.p, col='blue')
# now we have the original distribution histogram, but with relative frequencies instead of absolute numbers (counts)
#data.density is the relative distribution of a unreal pareto distribution
# Define Fitting Function -----------------------------------------------------------------
Fit.To.Pareto <- function(x, data.density.p) {
para.steps <- x[1]
para.shape <- x[2]
para.scale <- x[3]
daten <- data.density.p
# generate a PERFECT pareto distribution with given parameters ------------
simulated.vector <- calculate.pareto(c(20, para.steps, para.shape, para.scale))
#generate a PERFECT pareto distribution with given parameters
simulated.vector.p <- simulated.vector / sum(simulated.vector)
# obtain the relative frequencies of the above PERFECT Pareto distribution
# difference of simulated and real data ------------------------------------
delta <- (daten - simulated.vector.p)
#difference between real data (data.density.p, with fixed shape and scale)
#and simulated data with the fixed parameters
delta.sq <- delta^2
SRMSE <- sqrt(mean(delta.sq))
barplot(data.density.p, col='blue')
barplot(simulated.vector.p, col = 'red', add=T)
barplot(data.density.p, col='blue', add=T)
barplot(simulated.vector.p, col=rgb(1,0,0,0.5), add=T)
return(SRMSE)
}
# Optimization Fitting -----------------------------------------------------
passung <- optim(c(1, 1, 1),
fn = Fit.To.Pareto,
data.density.p = data.density.p,
method = "L-BFGS-B",
lower = c(1, 0.1, 0.1),
upper = c(10, 10, 10)) #needs to be adjusted maybe?
print(passung)
答: 暂无答案
评论