Commit 5e96b976 authored by Kootstra, Gert's avatar Kootstra, Gert
Browse files

day3 material

parent 785c423b
import matplotlib.pyplot as plt
import numpy as np
import torch
def show_images(imgs, num_rows, num_cols, titles=None, scale=1.5):
    """Plot a list of images."""
    width, height = num_cols * scale, num_rows * scale
    _, axes = plt.subplots(num_rows, num_cols, figsize=(width, height))
    axes = axes.flatten()
    for idx, (axis, image) in enumerate(zip(axes, imgs)):
        # Tensors come in as (C, H, W); imshow wants (H, W, C).
        axis.imshow(np.transpose(image.numpy(), (1, 2, 0)))
        axis.axes.get_xaxis().set_visible(False)
        axis.axes.get_yaxis().set_visible(False)
        if titles:
            axis.set_title(titles[idx])
    return axes
def try_gpu(i=0):
    """Return gpu(i) if exists, otherwise return cpu()."""
    # device_count() > i is equivalent to device_count() >= i + 1.
    gpu_available = torch.cuda.device_count() > i
    return torch.device(f'cuda:{i}' if gpu_available else 'cpu')
### Imports ###
import sklearn # Scikit-learn library which contains a lot of Machine Learning Models ready to use
import pandas as pd # Library for data analysis. We will use it to load datasets
import numpy as np # Library for highly efficient computations
import matplotlib.pyplot as plt # To create plots
from sklearn.metrics import mean_squared_error, r2_score # Import some performance metrics
import statsmodels.api as sm
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import PolynomialFeatures
def forward_selection(X, y,
                      initial_list=None,
                      threshold_in=0.01,
                      threshold_out=0.05,
                      verbose=True):
    """Perform a forward-backward (stepwise) feature selection
    based on p-value from statsmodels.api.OLS.

    Arguments:
        X - pandas.DataFrame with candidate features
        y - list-like with the target
        initial_list - list of features to start with (column names of X)
        threshold_in - include a feature if its p-value < threshold_in
        threshold_out - exclude a feature if its p-value > threshold_out
        verbose - whether to print the sequence of inclusions and exclusions
    Returns: list of selected features

    Always set threshold_in < threshold_out to avoid infinite looping.
    See https://en.wikipedia.org/wiki/Stepwise_regression for the details
    """
    # `None` default instead of a mutable `[]`, which would be shared
    # across calls and silently accumulate features.
    included = [] if initial_list is None else list(initial_list)
    while True:
        changed = False
        # --- forward step: try adding each currently excluded feature ---
        excluded = list(set(X.columns) - set(included))
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included + [new_column]]))).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()  # NaN when `excluded` is empty, so the test below is False
        if best_pval < threshold_in:
            # idxmin() returns the column *label*; argmin() returns a
            # positional integer in modern pandas, which would corrupt
            # `included` with ints instead of feature names.
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed = True
            if verbose:
                print('Add {:30} with p-value {:.6}'.format(best_feature, best_pval))

        # --- backward step: drop the weakest included feature ---
        model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
        # use all coefs except intercept
        pvalues = model.pvalues.iloc[1:]
        worst_pval = pvalues.max()  # NaN if pvalues is empty, so the test below is False
        if worst_pval > threshold_out:
            changed = True
            worst_feature = pvalues.idxmax()  # label, not position (see idxmin note above)
            included.remove(worst_feature)
            if verbose:
                print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))
        if not changed:
            break
    return included
def plot_dataset(X, Y):
    '''This function allows you to plot two different variables to visualize their relationship'''
    plt.scatter(X, Y, color='blue')
    # pandas Series carry their column name; use it to label the axes.
    plt.xlabel(X.name)
    plt.ylabel(Y.name)
    plt.show()
def get_X_y_data(dataset, y_name, x_names=None, reduce_wavelengths=None):  ## A modified version to get X and y data from this dataset
    '''Split a dataset into input features X and target values y.

    The target column is selected by `y_name`; the inputs are either the
    columns listed in `x_names` or every remaining column.
    The argument "reduce_wavelengths" is a factor which reduces the number
    of wavelengths, i.e. reduce_wavelengths=2 --> the number of wavelengths
    is divided by 2 --> 150/2 = 75
    '''
    y = dataset[y_name]
    X = dataset[x_names] if x_names else dataset.drop(y_name, axis=1)
    if reduce_wavelengths and reduce_wavelengths != 1:  # If number is provided
        # Keep every `reduce_wavelengths`-th column, starting from the second.
        kept_columns = list(X.columns)[1::reduce_wavelengths]
        X = X[kept_columns]
    return X, y
def print_dataset_size(dataset):
    '''Print the shape of a dataset as a human-readable summary.'''
    n_entries, n_variables = dataset.shape
    print("The size of the dataset is", dataset.shape)
    print("It has " + str(n_entries) + " entries. It contains " +
          str(n_variables) + " variables, including the target")
def evaluate_performace_regression(model, X_train, X_test, y_train, y_test, to_print=True):
    '''Compute MSE and R^2 on the train and test sets for a fitted regression model.

    When `to_print` is True the metrics are printed as a table and nothing is
    returned; otherwise returns (mse_train, mse_test, r2_train, r2_test).
    '''
    predictions_train = model.predict(X_train)  # Calculate predictions from train set
    predictions_test = model.predict(X_test)    # Calculate predictions from test set
    mse_train = mean_squared_error(y_train, predictions_train)
    mse_test = mean_squared_error(y_test, predictions_test)
    r2_train = r2_score(y_train, predictions_train)
    r2_test = r2_score(y_test, predictions_test)
    if to_print == True:
        print('''\t\t\t Mean Squared Error \t R^2
Training set \t\t %.2f \t\t %.2f
Test set \t\t %.2f \t\t %.2f''' % (mse_train, r2_train, mse_test, r2_test))
    else:
        return mse_train, mse_test, r2_train, r2_test
def plot_regression_performance(model, X_test, y_test, x_title='y pred', y_title='y truth'):
    '''Scatter-plot model predictions against ground truth with a linear trend line.'''
    predictions = model.predict(X_test)  # Calculate predictions from test set
    # Fit a degree-1 polynomial (straight line) through (pred, truth).
    trend = np.poly1d(np.polyfit(predictions, y_test, 1))
    r2_test = r2_score(y_test, predictions)
    plt.scatter(predictions, y_test, color='blue')
    plt.plot(predictions, trend(predictions), color='red', label='R2 = ' + str(round(r2_test, 2)))
    plt.xlabel(x_title)
    plt.ylabel(y_title)
    plt.legend()
    plt.show()
from sklearn.neural_network import MLPRegressor
from sklearn.neural_network import MLPClassifier
def neural_network_regression(number_hidden_layers=2, units_per_layer=32):
    '''Build an (unfitted) MLPRegressor with the requested hidden-layer geometry.'''
    hidden_sizes = (units_per_layer,) * number_hidden_layers
    return MLPRegressor(solver='lbfgs', alpha=0., max_iter=2000,
                        hidden_layer_sizes=hidden_sizes)
def neural_network_classification(number_hidden_layers=2, units_per_layer=32):
    '''Build an (unfitted) MLPClassifier with the requested hidden-layer geometry.'''
    hidden_sizes = (units_per_layer,) * number_hidden_layers
    return MLPClassifier(hidden_layer_sizes=hidden_sizes, max_iter=200, alpha=1e-4,
                         solver='sgd', verbose=10, random_state=1,
                         learning_rate_init=.1)
def add_poly_features(X):
    '''Append degree-2 polynomial features (squares and pairwise interactions)
    of the columns of DataFrame X as new columns named "0", "1", ... and
    return X (which is modified in place, as before).

    Bug fix: the original loop copied the FIRST columns of the
    PolynomialFeatures output — i.e. the bias column and duplicates of the
    original features — instead of the new higher-order terms.
    '''
    n_original = X.shape[1]
    X_poly = PolynomialFeatures(2).fit_transform(X)
    # PolynomialFeatures output layout: column 0 is the bias term,
    # columns 1..n_original are the original features, and the genuinely
    # new degree-2 terms start at n_original + 1.
    new_terms = X_poly[:, n_original + 1:]
    for i in range(new_terms.shape[1]):
        X[str(i)] = new_terms[:, i]
    return X
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment