Click here to return to the main page

Read in simulated data vector (excluded)

# quick and dirty, a truncated normal distribution to work on the solution set

#z <- rnorm(n=3000,mean=0.2)
#z <- data.frame(1:3000,z)
#names(z) <- list("ID","myVar")
#z <- z[z$myVar>0,]
#str(z)
#summary(z$myVar)

Read in own data

setwd("C:/Users/steph/OneDrive - University of Vermont/Documents/Dissertation_Research/StreamCarbon") #Set working directory where data is located
z1 <- read.csv("hannah_hbef_forest_r.csv")
str(z1)
## 'data.frame':    877 obs. of  10 variables:
##  $ stream           : chr  "kineo1" "kineo1" "kineo1" "kineo1" ...
##  $ plot             : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ ID               : chr  "tree0052" "tree0053" "tree0054" "tree0055" ...
##  $ spp              : chr  "yellow birch" "sugar maple" "yellow birch" "sugar maple" ...
##  $ species          : chr  "BEAL" "ACSA" "BEAL" "ACSA" ...
##  $ dbh              : num  50.8 13 55.1 18.2 45.5 ...
##  $ count            : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ living           : chr  "alive" "alive" "alive" "alive" ...
##  $ abv_mass_tonn    : num  1.5997 0.0688 1.9386 0.156 1.2327 ...
##  $ foliage_mass_tonn: num  0.03033 0.00183 0.03642 0.00364 0.02369 ...
summary(z1) #The above dataset is a little too complex for the goals here, so I'm going to create a new, simplified dataframe that only has one site 
##     stream               plot           ID                spp           
##  Length:877         Min.   :1.00   Length:877         Length:877        
##  Class :character   1st Qu.:2.00   Class :character   Class :character  
##  Mode  :character   Median :2.00   Mode  :character   Mode  :character  
##                     Mean   :2.49                                        
##                     3rd Qu.:3.00                                        
##                     Max.   :4.00                                        
##    species               dbh            count      living         
##  Length:877         Min.   :  5.0   Min.   :1   Length:877        
##  Class :character   1st Qu.: 21.4   1st Qu.:1   Class :character  
##  Mode  :character   Median : 32.9   Median :1   Mode  :character  
##                     Mean   : 34.5   Mean   :1                     
##                     3rd Qu.: 44.9   3rd Qu.:1                     
##                     Max.   :100.8   Max.   :1                     
##  abv_mass_tonn    foliage_mass_tonn
##  Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.1980   1st Qu.:0.00504  
##  Median :0.5659   Median :0.01415  
##  Mean   :0.8961   Mean   :0.02201  
##  3rd Qu.:1.1883   3rd Qu.:0.02789  
##  Max.   :8.0888   Max.   :0.20138
z <- z1[which(z1$stream == "cushman"),] #create new dataframe only using stream Cushman

z <- z[-3] # remove ID column that has lwd at the front

z$ID <- seq(1:length(z$stream)) #create new ID column with a column of integers values

Plot histogram of data

p1 <- ggplot(data=z, aes(x=dbh, y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2)  # make histogram for DBH
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Get maximum likelihood parameters for normal

normPars <- fitdistr(z$dbh,"normal") # get the normal parameters for DBH variable
print(normPars)
##      mean         sd    
##   32.109392   14.211785 
##  ( 1.776473) ( 1.256156)
str(normPars)
## List of 5
##  $ estimate: Named num [1:2] 32.1 14.2
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ sd      : Named num [1:2] 1.78 1.26
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ vcov    : num [1:2, 1:2] 3.16 0 0 1.58
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "mean" "sd"
##   .. ..$ : chr [1:2] "mean" "sd"
##  $ n       : int 64
##  $ loglik  : num -261
##  - attr(*, "class")= chr "fitdistr"
normPars$estimate["mean"] # note structure of getting a named attribute
##     mean 
## 32.10939

Plot normal probability density

meanML <- normPars$estimate["mean"] #set mean from normal parameters as manML
sdML <- normPars$estimate["sd"] #set sd from normal parameters as sdML

xval <- seq(0,max(z$dbh),len=length(z$dbh)) #a sequence of equally spaced numbers from 0 to the max of the DBH column that is the length of dbh column

 stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(z$dbh), args = list(mean = meanML, sd = sdML)) #create normal dist
 p1 + stat #plot graph and normal dist
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Plot exponential probability value

#get parameters from DBh data
expoPars <- fitdistr(z$dbh,"exponential")
#set rate based on fitdistr()
rateML <- expoPars$estimate["rate"]

#make exponential fit
stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue", n = length(z$dbh), args = list(rate=rateML))
 p1 + stat + stat2 #plot histogram with all fits so far
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Plot uniform probability density

stat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(z$dbh), args = list(min=min(z$dbh), max=max(z$dbh))) #create uniform distribution line
 p1 + stat + stat2 + stat3 #plot histogram with all distributions so far
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Plot gamma probability density

gammaPars <- fitdistr(z$dbh,"gamma", lower = c(0,0)) # get gamma parameters

#set gamma parameters for use in gamma distibution
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]

stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$dbh), args = list(shape=shapeML, rate=rateML))
 p1 + stat + stat2 + stat3 + stat4 #plot all distributions
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Plot beta probability density

pSpecial <- ggplot(data=z, aes(x=dbh/(max(dbh + 0.1)), y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) + 
  xlim(c(0,1)) +
  geom_density(size=0.75,linetype="dotted") #creates dotted line fit on histogram

#get and set beta parameters
betaPars <- fitdistr(x=z$dbh/max(z$dbh + 0.1),start=list(shape1=1,shape2=2),"beta", lower = c(0,0))
shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]

statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(z$dbh), args = list(shape1=shape1ML,shape2=shape2ML)) #creates solid pink line fit on histogram
pSpecial + statSpecial #Print histogram with beta distribution lines
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).

Questions 3 and 4 Simulating dataset based on best fitting distribution

z2 <- rnorm(n=length(z$ID),mean=normPars$estimate["mean"], sd=normPars$estimate["sd"])
z2 <- data.frame(1:length(z$ID),z2)
colnames(z2) <- c("ID", "myVar")

Plotting my data and simulated data distributions

#plotting simulated data with a normal distribution
 p2 <- ggplot(data=z2, aes(x=myVar, y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) 

xval <- seq(0,max(z2$myVar),len=length(z2$myVar))

 stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(z2$myVar), args = list(mean = meanML, sd = sdML))
 p2 + stat
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 #plotting my data with a normal distribution

xval <- seq(0,max(z$dbh),len=length(z$dbh))

 stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(z$dbh), args = list(mean = meanML, sd = sdML))
 p1 + stat
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Question 4

The two histogram profiles are relatively similar with the main distinction being that my data have more observations in the left tail, and fewer in the right tail than the simulated distributions. This makes sense because this is a distribution of DBHs, and due to the land-use history of this forest, there are few large (old) trees, and more small (young) trees. The simulated data are not capturing this dynamic very well but otherwise look okay.

I tried running the code on the the abv_mass_tonn column and the foliage_mass_tonn columns but because the datasets were very skewed, the gamma and beta distributions threw out errors. Here’s an example error “Error in stats::optim(x = c(0.0497695572426532, 0.0505961777599534, 0.13531441660897, : non-finite finite-difference value [1]”

The solutions on the internet were unfortunately not helpful in resolving the error as I was not familiar with most of the terminology being used.