
Question 1 Repeat the exercise from the Batch Processing Lecture (5 April), but do it using real data sets rather than purely simulated. Check with folks in your lab to see if there are multiple data sets available for analysis, or ask Nick, Lauren, or Emily for suggestions for other data sources. Stick to simple data analyses and graphics, but try to set it up as a batch process that will work on multiple files and save summary results to a common file.

# The script below asks whether there is a relationship between the diameter and the length of logs in headwater streams at the Hubbard Brook Experimental Forest in NH. For each of the 5 streams we surveyed, it fits a linear model of log diameter (dlwd, the response) as a function of log length (llwd, the predictor). I used the code from class and then added/changed lines as necessary for my data.
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.7
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
##################################################
# function: reg_stats
# fits linear model, extracts statistics
# input: data frame with dlwd (diameter) and llwd (length) columns
# output: slope, p-value, and r2
#------------------------------------------------- 
reg_stats <- function(d=NULL) {
             if(is.null(d)) { # default: simulate a small data set with the same column names as the stream files
               d <- data.frame(dlwd=runif(10),
                               llwd=runif(10))
             }
  . <- lm(dlwd~llwd,data=d) # fit linear model: log diameter (dlwd) as a function of log length (llwd)
  . <- summary(.)
  stats_list <- list(slope=.$coefficients[2,1],
                    p_val=.$coefficients[2,4],
                    r2=.$r.squared) # pull out the coefficients we're interested in
  return(stats_list)
}
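
As a quick sanity check (a sketch, not part of the graded results), the function can be run on its simulated default data before pointing it at the real stream files; the commented line shows the same call on a single real file, using one of the file names that appears in the summary table below.

reg_stats() # returns slope, p-value, and r2 for the simulated default data
# reg_stats(read.table("homework11Data/Cushman.csv", sep=",", header=TRUE))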

#Body of script for batch processing of regression models

#--------------------------------------------
# Global variables
file_folder <- "homework11Data/"
n_files <- 5
file_out <- "StatsSummary.csv"
file_names <- list.files(path=file_folder)
#--------------------------------------------
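
Because n_files is hard-coded, a small defensive check (my addition, not required by the assignment) makes sure it matches what is actually sitting in homework11Data/ before the results data frame is built:

stopifnot(length(file_names) == n_files) # stop early if the folder contents change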


# Create data frame to hold file summary statistics
ID <- seq_along(file_names)
file_name <- file_names
slope <- rep(NA,n_files)
p_val <- rep(NA,n_files)
r2 <- rep(NA,n_files)

stats_out <- data.frame(ID,file_name,slope,p_val,r2) #combine into dataframe

# batch process by looping through individual files
for (i in seq_along(file_names)) {
  data <- read.table(file=paste(file_folder,file_names[i],sep=""),
                     sep=",",
                     header=TRUE) # read in next data file
  
  d_clean <- data[complete.cases(data),] # get clean cases
  
  . <- reg_stats(d_clean) # pull regression stats from clean file
  stats_out[i,3:5] <- unlist(.) # unlist, copy into last 3 columns
  
}
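
If one of the stream files were ever malformed (say, missing the dlwd or llwd column), the loop above would stop partway through the batch. A hedged alternative, wrapping the model fit in tryCatch() so the remaining files still get processed and the failed file is simply left as NA (this fallback is my assumption, not part of the original assignment):

# alternative loop body: skip problem files instead of halting the whole batch
for (i in seq_along(file_names)) {
  data <- read.table(file=paste(file_folder,file_names[i],sep=""),
                     sep=",",
                     header=TRUE)
  d_clean <- data[complete.cases(data),]
  stats_out[i,3:5] <- tryCatch(unlist(reg_stats(d_clean)),
                               error=function(e) rep(NA,3)) # NAs flag the failed file
}
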
# set up output file and incorporate time stamp and minimal metadata
  cat("# Summary stats for ",
      "batch processing of regression models","\n",
      "# timestamp: ",as.character(Sys.time()),"\n",
      "# SPC","\n",
      "# ------------------------", "\n",
      "\n",
      file=file_out,
      sep="") # cat() writes the metadata header directly to the file; no write.table() wrapper needed
# now add the data frame
  write.table(x=stats_out,
              file=file_out,
              row.names=FALSE,
              col.names=TRUE,
              sep=",",
              append=TRUE)
## Warning in write.table(x = stats_out, file = file_out, row.names = FALSE, :
## appending column names to file
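
The warning above is expected: write.table() is just noting that it is appending the column-name row to a file that already holds the metadata header, and the output file is fine. If a cleaner console log is wanted, the call could be wrapped in suppressWarnings() (shown commented out so the summary rows aren't written to the file a second time):

# suppressWarnings(write.table(x=stats_out, file=file_out, row.names=FALSE,
#                              col.names=TRUE, sep=",", append=TRUE))
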
StatsSummary <- read.table(file = "StatsSummary.csv", header = TRUE, sep = ",")

print(StatsSummary)
##   ID       file_name     slope        p_val          r2
## 1  1 BagleyTrail.csv 0.6578014 4.563851e-01 0.011858325
## 2  2       Crazy.csv 0.7462687 4.991573e-01 0.009372291
## 3  3     Cushman.csv 2.6766607 7.456232e-06 0.211412620
## 4  4       Falls.csv 1.6315751 1.296709e-03 0.129662955
## 5  5      Kineo1.csv 1.8066935 2.011457e-02 0.060537203
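
The assignment also asks for simple graphics, so here is a quick sketch of how the batch results could be plotted (the plot was not part of my original output; the column names come from StatsSummary above):

ggplot(data=StatsSummary, aes(x=file_name, y=slope, fill=p_val < 0.05)) +
  geom_col() + # one bar per stream, colored by whether the slope is significant
  labs(x="Stream file", y="Slope of dlwd ~ llwd", fill="p < 0.05") +
  theme_bw()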