Commit 54fb308c authored by Lie-Piang, Anouk's avatar Lie-Piang, Anouk
Browse files

Upload New File

parent 4130b36b
---
title: "R Notebook"
output:
html_notebook: default
html_document: default
---
#Import data
```{r}
library(readxl)
Component_Data <- read_excel("ComponentData_Average_data.xlsx",
sheet = "7. Training_set_SFSP") #Fill in the category to be analysed: Isolates, Mildly refined, Mixtures, Alldata
attach(Component_Data)
#detach(Component_Data)
```
```{r}
#rename variables
starch = `Starch (%)`
protein = `Protein (%)`
fibre = `Fibre (%)`
rest = `Residual components`
viscosity= log(`Final viscosity (mPa.s)`)
```
## Load libraries
```{r}
library(MASS)
library(tidyverse)
library(ggplot2)
library(heatmap3)
library(corrplot)
library(cowplot)
library(car)
library(R2HTML)
library(readxl)
```
# Create all possible formulas
```{r}
terms <- c("protein", "starch", "fibre", "rest",
"protein:starch", "protein:fibre", "starch:fibre",
"protein:starch:fibre")
model_terms <- lapply(1:length(terms), function(x) combn(terms, x)) # each col is a model
right_formulas <- lapply(model_terms, function(x) {
apply(x, 2, paste, collapse = " + ")
})
```
#Fitting the models with multiple linear regression
```{r}
Viscosity_formulas <- lapply(right_formulas, function(x) {
lapply(x, function(y) paste0("viscosity ~ ", y))
})
Viscosity_models <- lapply(Viscosity_formulas, function(x) {
lapply(x, function(y) {
lm(y, data = Component_Data)
})
})
```
# Extract the AICs and the adjusted $R^2 to see which model is the best
```{r}
Viscosity_AIC <- lapply(Viscosity_models, function(x) {
lapply(x, AIC)
})
Viscosity_R2 <- lapply(Viscosity_models, function(x) {
lapply(x, function(y) summary(y)$adj.r.squared)
})
Viscosity_quality <- lapply(1:length(Viscosity_AIC), function(x) {
out <- do.call(rbind.data.frame, Viscosity_AIC[[x]]) %>%
mutate(n_terms = x)
names(out)[1] <- "AIC"
out
}) %>% do.call(rbind.data.frame, .)
Viscosity_quality <- lapply(1:length(Viscosity_R2), function(x) {
out <- do.call(rbind.data.frame, Viscosity_R2[[x]])
names(out)[1] <- "R.squared"
out
}) %>%
do.call(rbind.data.frame, .) %>%
cbind(Viscosity_quality, .)
```
#Aikaike versus R2
```{r}
ggplot(Viscosity_quality) +
geom_point(aes(x = R.squared, y = AIC, colour = as.factor(n_terms)))
```
#finally, the selection of the right model. This is tricky. Let's arrange the models by $R^2$ and AIC.
```{r}
R2_arranged <- Viscosity_quality %>%
arrange(desc(R.squared))
R2_arranged
```
```{r}
AIC_arranged <- Viscosity_quality %>%
arrange(AIC)
AIC_arranged
```
# Print the summary of the 5 best models by AIC.
```{r}
print_models_AIC <- lapply(1:2, function(x) {
my_models <- Viscosity_models[[AIC_arranged$n_terms[x]]]
models_AIC <- lapply(my_models, AIC)
my_index <- which(AIC_arranged$AIC[x] == models_AIC)
summary(my_models[[my_index]])
})
HTML2clip(print_models_AIC) #to copy results to clipboard
```
# Print 5 best models according to the adjusted $R^2$.
```{r}
#print_models_R2 <-
lapply(1:5, function(x) {
my_models <- Viscosity_models[[R2_arranged$n_terms[x]]]
models_AIC <- lapply(my_models, AIC)
my_index <- which(R2_arranged$AIC[x] == models_AIC)
summary(my_models[[my_index]])
})
#HTML2clip(print_models_R2) #to copy results to clipboard
```
# Residuals vs fitted values for the top 5 models according to AIC.
```{r}
lapply(1:1, function(x) {
my_models <- Viscosity_models[[AIC_arranged$n_terms[x]]]
models_AIC <- lapply(my_models, AIC)
my_index <- which(AIC_arranged$AIC[x] == models_AIC)
plot(my_models[[my_index]])
})
```
# Histograms of residuals
```{r}
lapply(1:1, function(x) {
my_models <- Viscosity_models[[AIC_arranged$n_terms[x]]]
models_AIC <- lapply(my_models, AIC)
my_index <- which(AIC_arranged$AIC[x] == models_AIC)
g1 <- qplot(my_models[[my_index]]$residuals,
geom = "histogram",
bins = 10) +
labs(title = "Histogram of residuals",
x = "residual")
})
```
#Plot parity plot
```{r}
lapply(1:1, function(x){
my_models <- Viscosity_models[[AIC_arranged$n_terms[x]]]
models_AIC <- lapply(my_models, AIC)
my_index <- which(AIC_arranged$AIC[x] == models_AIC)
Viscosity_lm <- my_models[[my_index]]
p1 <- data_frame(data = Viscosity_lm$model$viscosity,
pred = predict(Viscosity_lm)) %>%
ggplot() +
geom_point(aes(x = pred, y = data)) +
geom_abline(slope = 1, intercept = 0, linetype = 2) +
xlab("Predicted ln(Final Viscosity)") + ylab("Observed ln(Final Viscosity)")
print(p1)
})
```
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment