Commit e32b6fa2 authored by Lie-Piang, Anouk's avatar Lie-Piang, Anouk
Browse files

Upload New File

parent 9388ed1b
---
title: "R Notebook"
output:
html_notebook: default
html_document: default
---
# Set WD
```{r}
setwd("C:/ModelsLocal")
```
#Import data
```{r}
library(readxl)
Component_Data <- read_excel("ComponentData_Average_data.xlsx",
sheet = "7. Training_set_SFSP") #Fill in the category to be analysed: Isolates, Mildly refined, Mixtures, Alldata
attach(Component_Data)
#detach(Component_Data)
```
#Check correlations
```{r}
corcheck <- read.csv("CorrelationCheck.csv")
cor(corcheck)
library(corrplot)
library(RColorBrewer)
M <-cor(corcheck)
corrplot(M, type="upper", order="hclust",
col=brewer.pal(n=8, name="RdYlBu"))
```
```{r}
#rename variables
starch = `Starch (%)`
protein = `Protein (%)`
fibre = `Fibre (%)`
rest = `Residual components`
viscosity= log(`Final viscosity (mPa.s)`)
```
## Load libraries
```{r}
library(MASS)
library(tidyverse)
library(ggplot2)
library(heatmap3)
library(corrplot)
library(cowplot)
library(car)
library(R2HTML)
library(readxl)
```
# Create all possible formulas
```{r}
terms <- c("protein", "starch", "fibre", "rest",
"protein:starch", "protein:fibre", "starch:fibre",
"protein:starch:fibre")
model_terms <- lapply(1:length(terms), function(x) combn(terms, x)) # each col is a model
right_formulas <- lapply(model_terms, function(x) {
apply(x, 2, paste, collapse = " + ")
})
```
#Fitting the models with multiple linear regression
```{r}
Viscosity_formulas <- lapply(right_formulas, function(x) {
lapply(x, function(y) paste0("viscosity ~ ", y))
})
Viscosity_models <- lapply(Viscosity_formulas, function(x) {
lapply(x, function(y) {
lm(y, data = Component_Data)
})
})
```
# Extract the AICs and the adjusted $R^2 to see which model is the best
```{r}
Viscosity_AIC <- lapply(Viscosity_models, function(x) {
lapply(x, AIC)
})
Viscosity_R2 <- lapply(Viscosity_models, function(x) {
lapply(x, function(y) summary(y)$adj.r.squared)
})
Viscosity_quality <- lapply(1:length(Viscosity_AIC), function(x) {
out <- do.call(rbind.data.frame, Viscosity_AIC[[x]]) %>%
mutate(n_terms = x)
names(out)[1] <- "AIC"
out
}) %>% do.call(rbind.data.frame, .)
Viscosity_quality <- lapply(1:length(Viscosity_R2), function(x) {
out <- do.call(rbind.data.frame, Viscosity_R2[[x]])
names(out)[1] <- "R.squared"
out
}) %>%
do.call(rbind.data.frame, .) %>%
cbind(Viscosity_quality, .)
```
#Aikaike versus R2
```{r}
ggplot(Viscosity_quality) +
geom_point(aes(x = R.squared, y = AIC, colour = as.factor(n_terms)))
```
#finally, the selection of the right model. This is tricky. Let's arrange the models by $R^2$ and AIC.
```{r}
R2_arranged <- Viscosity_quality %>%
arrange(desc(R.squared))
R2_arranged
```
```{r}
AIC_arranged <- Viscosity_quality %>%
arrange(AIC)
AIC_arranged
```
# Print the summary of the 5 best models by AIC.
```{r}
print_models_AIC <- lapply(1:2, function(x) {
my_models <- Viscosity_models[[AIC_arranged$n_terms[x]]]
models_AIC <- lapply(my_models, AIC)
my_index <- which(AIC_arranged$AIC[x] == models_AIC)
summary(my_models[[my_index]])
})
HTML2clip(print_models_AIC) #to copy results to clipboard
#Other methods
#HTML2clip(summary(my_models[[my_index]]), filename = "cli[p")
#write.csv(as.data.frame(summary(my_models[[my_index]]$)),file = "mymodels_1.csv")
#sink(file = "sinktest.csv")
print(print_models_AIC)
#sink()
```
# Print 5 best models according to the adjusted $R^2$.
```{r}
#print_models_R2 <-
lapply(1:5, function(x) {
my_models <- Viscosity_models[[R2_arranged$n_terms[x]]]
models_AIC <- lapply(my_models, AIC)
my_index <- which(R2_arranged$AIC[x] == models_AIC)
summary(my_models[[my_index]])
})
#HTML2clip(print_models_R2) #to copy results to clipboard
```
# Residuals vs fitted values for the top 5 models according to AIC.
```{r}
lapply(1:1, function(x) {
my_models <- Viscosity_models[[AIC_arranged$n_terms[x]]]
models_AIC <- lapply(my_models, AIC)
my_index <- which(AIC_arranged$AIC[x] == models_AIC)
plot(my_models[[my_index]])
})
```
#Predict
```{r}
lapply(1:1, function(x) {
my_models <- Viscosity_models[[AIC_arranged$n_terms[x]]]
models_AIC <- lapply(my_models, AIC)
my_index <- which(AIC_arranged$AIC[x] == models_AIC)
predict(my_models[[my_index]], data.frame(viscosity = 10))
})
```
#Extracting data point from QQ plot to see the deviating points
```{r}
lapply(1:1, function(x) {
my_models <- Viscosity_models[[AIC_arranged$n_terms[x]]]
models_AIC <- lapply(my_models, AIC)
my_index <- which(AIC_arranged$AIC[x] == models_AIC)
se <- sqrt(sum(my_models[[my_index]]$residuals^2) / my_models[[my_index]]$df.residual) ## Pearson residual standard error
hii <- lm.influence(my_models[[my_index]], do.coef = FALSE)$hat ## leverage
std.resi <- my_models[[my_index]]$residuals / (se * sqrt(1 - hii)) ## standardized residuals
par(mfrow = c(1,2))
qqnorm(std.resi, main = "my Q-Q"); qqline(std.resi, lty = 2)
plot(my_models[[my_index]], which = 2) ## only display Q-Q plot
x <- sort(abs(std.resi), decreasing = TRUE)
id <- as.integer(names(x))
id[1:20] #fill in the number of amount of data points to be extrated
})
```
# Histograms of residuals
```{r}
lapply(1:1, function(x) {
my_models <- Viscosity_models[[AIC_arranged$n_terms[x]]]
models_AIC <- lapply(my_models, AIC)
my_index <- which(AIC_arranged$AIC[x] == models_AIC)
g1 <- qplot(my_models[[my_index]]$residuals,
geom = "histogram",
bins = 10) +
labs(title = "Histogram of residuals",
x = "residual")
})
```
#We check normality of the residuals with the Shapiro-Wilk normality test:
```{r}
lapply(1:5, function(x) {
my_models <- Viscosity_models[[AIC_arranged$n_terms[x]]]
models_AIC <- lapply(my_models, AIC)
my_index <- which(AIC_arranged$AIC[x] == models_AIC)
shapiro.test(residuals(my_models[[my_index]]))
})
```
```{r}
lapply(1:1, function(x){
my_models <- Viscosity_models[[AIC_arranged$n_terms[x]]]
models_AIC <- lapply(my_models, AIC)
my_index <- which(AIC_arranged$AIC[x] == models_AIC)
Viscosity_lm <- my_models[[my_index]]
p1 <- data_frame(data = Viscosity_lm$model$viscosity,
pred = predict(Viscosity_lm)) %>%
ggplot() +
geom_point(aes(x = pred, y = data)) +
geom_abline(slope = 1, intercept = 0, linetype = 2) +
xlab("Predicted ln(Final Viscosity)") + ylab("Observed ln(Final Viscosity)")
print(p1)
})
```
Durbin-Watson test:
```{r}
lapply(1:5, function(x) {
my_models <- Viscosity_models[[AIC_arranged$n_terms[x]]]
models_AIC <- lapply(my_models, AIC)
my_index <- which(AIC_arranged$AIC[x] == models_AIC)
durbinWatsonTest(residuals(my_models[[my_index]]))
})
```
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment