Click here to return to the main page
Read in simulated data vector (excluded)
# quick and dirty, a truncated normal distribution to work on the solution set
#z <- rnorm(n=3000,mean=0.2)
#z <- data.frame(1:3000,z)
#names(z) <- list("ID","myVar")
#z <- z[z$myVar>0,]
#str(z)
#summary(z$myVar)
Read in own data
setwd("C:/Users/steph/OneDrive - University of Vermont/Documents/Dissertation_Research/StreamCarbon") #Set working directory where data is located
z1 <- read.csv("hannah_hbef_forest_r.csv")
str(z1)
## 'data.frame': 877 obs. of 10 variables:
## $ stream : chr "kineo1" "kineo1" "kineo1" "kineo1" ...
## $ plot : int 1 1 1 1 1 1 1 1 1 1 ...
## $ ID : chr "tree0052" "tree0053" "tree0054" "tree0055" ...
## $ spp : chr "yellow birch" "sugar maple" "yellow birch" "sugar maple" ...
## $ species : chr "BEAL" "ACSA" "BEAL" "ACSA" ...
## $ dbh : num 50.8 13 55.1 18.2 45.5 ...
## $ count : int 1 1 1 1 1 1 1 1 1 1 ...
## $ living : chr "alive" "alive" "alive" "alive" ...
## $ abv_mass_tonn : num 1.5997 0.0688 1.9386 0.156 1.2327 ...
## $ foliage_mass_tonn: num 0.03033 0.00183 0.03642 0.00364 0.02369 ...
summary(z1) #The above dataset is a little too complex for the goals here, so I'm going to create a new, simplified dataframe that only has one site
## stream plot ID spp
## Length:877 Min. :1.00 Length:877 Length:877
## Class :character 1st Qu.:2.00 Class :character Class :character
## Mode :character Median :2.00 Mode :character Mode :character
## Mean :2.49
## 3rd Qu.:3.00
## Max. :4.00
## species dbh count living
## Length:877 Min. : 5.0 Min. :1 Length:877
## Class :character 1st Qu.: 21.4 1st Qu.:1 Class :character
## Mode :character Median : 32.9 Median :1 Mode :character
## Mean : 34.5 Mean :1
## 3rd Qu.: 44.9 3rd Qu.:1
## Max. :100.8 Max. :1
## abv_mass_tonn foliage_mass_tonn
## Min. :0.0000 Min. :0.00000
## 1st Qu.:0.1980 1st Qu.:0.00504
## Median :0.5659 Median :0.01415
## Mean :0.8961 Mean :0.02201
## 3rd Qu.:1.1883 3rd Qu.:0.02789
## Max. :8.0888 Max. :0.20138
z <- z1[which(z1$stream == "cushman"),] #create new dataframe only using stream Cushman
z <- z[-3] # remove ID column that has lwd at the front
z$ID <- seq(1:length(z$stream)) #create new ID column with a column of integers values
Plot histogram of data
p1 <- ggplot(data=z, aes(x=dbh, y=..density..)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2) # make histogram for DBH
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Get maximum likelihood parameters for normal
normPars <- fitdistr(z$dbh,"normal") # get the normal parameters for DBH variable
print(normPars)
## mean sd
## 32.109392 14.211785
## ( 1.776473) ( 1.256156)
str(normPars)
## List of 5
## $ estimate: Named num [1:2] 32.1 14.2
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ sd : Named num [1:2] 1.78 1.26
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ vcov : num [1:2, 1:2] 3.16 0 0 1.58
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "mean" "sd"
## .. ..$ : chr [1:2] "mean" "sd"
## $ n : int 64
## $ loglik : num -261
## - attr(*, "class")= chr "fitdistr"
normPars$estimate["mean"] # note structure of getting a named attribute
## mean
## 32.10939
Plot normal probability density
meanML <- normPars$estimate["mean"] #set mean from normal parameters as manML
sdML <- normPars$estimate["sd"] #set sd from normal parameters as sdML
xval <- seq(0,max(z$dbh),len=length(z$dbh)) #a sequence of equally spaced numbers from 0 to the max of the DBH column that is the length of dbh column
stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(z$dbh), args = list(mean = meanML, sd = sdML)) #create normal dist
p1 + stat #plot graph and normal dist
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Plot exponential probability value
#get parameters from DBh data
expoPars <- fitdistr(z$dbh,"exponential")
#set rate based on fitdistr()
rateML <- expoPars$estimate["rate"]
#make exponential fit
stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue", n = length(z$dbh), args = list(rate=rateML))
p1 + stat + stat2 #plot histogram with all fits so far
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Plot uniform probability density
stat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(z$dbh), args = list(min=min(z$dbh), max=max(z$dbh))) #create uniform distribution line
p1 + stat + stat2 + stat3 #plot histogram with all distributions so far
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Plot gamma probability density
gammaPars <- fitdistr(z$dbh,"gamma", lower = c(0,0)) # get gamma parameters
#set gamma parameters for use in gamma distibution
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]
stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$dbh), args = list(shape=shapeML, rate=rateML))
p1 + stat + stat2 + stat3 + stat4 #plot all distributions
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Plot beta probability density
pSpecial <- ggplot(data=z, aes(x=dbh/(max(dbh + 0.1)), y=..density..)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2) +
xlim(c(0,1)) +
geom_density(size=0.75,linetype="dotted") #creates dotted line fit on histogram
#get and set beta parameters
betaPars <- fitdistr(x=z$dbh/max(z$dbh + 0.1),start=list(shape1=1,shape2=2),"beta", lower = c(0,0))
shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]
statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(z$dbh), args = list(shape1=shape1ML,shape2=shape2ML)) #creates solid pink line fit on histogram
pSpecial + statSpecial #Print histogram with beta distribution lines
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
Questions 3 and 4 Simulating dataset based on best fitting distribution
z2 <- rnorm(n=length(z$ID),mean=normPars$estimate["mean"], sd=normPars$estimate["sd"])
z2 <- data.frame(1:length(z$ID),z2)
colnames(z2) <- c("ID", "myVar")
Plotting my data and simulated data distributions
#plotting simulated data with a normal distribution
p2 <- ggplot(data=z2, aes(x=myVar, y=..density..)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2)
xval <- seq(0,max(z2$myVar),len=length(z2$myVar))
stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(z2$myVar), args = list(mean = meanML, sd = sdML))
p2 + stat
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#plotting my data with a normal distribution
xval <- seq(0,max(z$dbh),len=length(z$dbh))
stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(z$dbh), args = list(mean = meanML, sd = sdML))
p1 + stat
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Question 4
The two histogram profiles are relatively similar with the main distinction being that my data have more observations in the left tail, and fewer in the right tail than the simulated distributions. This makes sense because this is a distribution of DBHs, and due to the land-use history of this forest, there are few large (old) trees, and more small (young) trees. The simulated data are not capturing this dynamic very well but otherwise look okay.
I tried running the code on the the abv_mass_tonn column and the foliage_mass_tonn columns but because the datasets were very skewed, the gamma and beta distributions threw out errors. Here’s an example error “Error in stats::optim(x = c(0.0497695572426532, 0.0505961777599534, 0.13531441660897, : non-finite finite-difference value [1]”
The solutions on the internet were unfortunately not helpful in resolving the error as I was not familiar with most of the terminology being used.