Commit 501d010b authored by Ron Wehrens

final version of lecture 2

parent 90ab6764
@@ -472,7 +472,6 @@ welchTest <- function(mns, vrs, ns) {
\includegraphics[width=.6\textwidth]{takefive.jpg}}
\end{frame}
\begin{frame}[fragile,containsverbatim]
\frametitle{The \code{bigmemory} package}
\begin{columns}[onlytextwidth]
@@ -551,12 +550,13 @@ welchTest <- function(mns, vrs, ns) {
\frametitle{Analysis for big data sets}
\begin{columns}[onlytextwidth]
\begin{column}<+->[t]{.45\textwidth}
The \code{biglm} package:
The \code{biglm} package (Lumley):
\begin{itemize}[<+->]
\item create a linear model using only $O(p^2)$ memory for $p$
  variables (the data itself is not kept in memory)
\item add more data using \code{update}
\item in this way, data sets larger than the available memory can
  be handled!
\item the \code{biganalytics} package provides a variant,
  \code{biglm.big.matrix}, for \code{big.matrix} objects
\end{itemize}
\end{column}
\begin{column}<+->[t]{.45\textwidth}
@@ -566,7 +566,7 @@ welchTest <- function(mns, vrs, ns) {
\item no memory overhead: just the data matrix and the class
labels
\item classical \code{kmeans} needs at least two extra copies
  (see the check below)
\item also works with ``normal'' matrices
\end{itemize}
\end{column}
\end{columns}
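\pause
A quick way to check the memory claim is \code{profmem}, as used on
the \code{biglm} slide later. This is only a sketch: it assumes
\code{mymat} and \code{mymat.bm} as created in the timing example
further on.
\begin{Schunk} \tiny
\begin{Sinput}
library(profmem)
library(biganalytics)
## total R-level allocations of classical kmeans on a plain matrix
total(profmem(kmeans(mymat, centers = 6)))
## versus bigkmeans on the equivalent big.matrix
total(profmem(bigkmeans(mymat.bm, centers = 6)))
\end{Sinput}
\end{Schunk}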
@@ -594,6 +594,50 @@ welchTest <- function(mns, vrs, ns) {
\end{itemize}
\end{frame}
\begin{frame}[fragile,containsverbatim]
\frametitle{Results (on my laptop)}
\begin{Schunk} \tiny
\begin{Sinput}
## select only flights from JFK between Feb 1 and March 31
flights14.df <- as.data.frame(flights14)
system.time(for (i in 1:10)
huhn <- flights14.df[flights14.df$month %in% 2:3 &
flights14.df$origin == "JFK",])
## 0.088
\end{Sinput}
\end{Schunk} \pause
\begin{Schunk} \tiny
\begin{Sinput}
system.time(for (i in 1:100)
huhn <- flights14[month %in% 2:3 & origin == "JFK",])
## 0.077
\end{Sinput}
\end{Schunk} \pause
\begin{Schunk} \tiny
\begin{Sinput}
flights14.bm <- as.big.matrix(flights14)
## NOTE: non-numeric columns are converted to factors and then
## to integers, so you CANNOT select on origin == "JFK" directly
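## to recover the integer code for "JFK" (a sketch: factor levels
## sort alphabetically, so here EWR = 1, JFK = 2, LGA = 3)
which(levels(factor(flights14.df$origin)) == "JFK")
## 2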
system.time(for (i in 1:10)
huhn <- flights14.bm[flights14.bm[,"month"] %in% 2:3 &
flights14.bm[,"origin"] == 2,])
## 1.038
\end{Sinput}
\end{Schunk} \pause
\begin{Schunk} \tiny
\begin{Sinput}
flights14.bm2 <- as.big.matrix(flights14, backingfile = "fl14TMP")
system.time(for (i in 1:10)
huhn <- flights14.bm2[flights14.bm2[,"month"] %in% 2:3 &
flights14.bm2[,"origin"] == 2,])
## 1.172
\end{Sinput}
\end{Schunk}
\pause
Look at \code{?mwhich} for fast selection in \code{big.matrix} objects.
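\pause
For example, the selection above could also be written with
\code{mwhich}. A sketch, untimed; \code{2} again stands for
\code{"JFK"}:
\begin{Schunk} \tiny
\begin{Sinput}
## rows with 2 <= month <= 3 AND origin == "JFK" (integer code 2)
huhn <- flights14.bm[mwhich(flights14.bm,
                            cols = c("month", "origin"),
                            vals = list(c(2, 3), 2),
                            comps = list(c("ge", "le"), "eq"),
                            op = "AND"), ]
\end{Sinput}
\end{Schunk}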
\end{frame}
\begin{frame}[fragile,containsverbatim]
\frametitle{Big K Means}
\begin{itemize}
@@ -601,23 +645,105 @@ welchTest <- function(mns, vrs, ns) {
flight data, using the \code{bigkmeans} function. Choose any $k$
that you like...
\item Compare the results with those of the regular
\code{kmeans} function (from the MASS package).
\code{kmeans} function.
\item Are the clusters related to,
  \emph{e.g.}, season (see the sketch below)? Is the clustering
  meaningful at all?
\end{itemize}
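\pause
One quick check of the season question. A sketch: it assumes the
\code{bigkmeans} result is stored in \code{huhn}, as in the timing
code on the next slide.
\begin{Schunk} \tiny
\begin{Sinput}
## cross-tabulate cluster labels against month as a crude season proxy
table(cluster = huhn$cluster, month = flights14$month)
\end{Sinput}
\end{Schunk}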
\end{frame}
\begin{frame}[fragile,containsverbatim]
\frametitle{Big lm}
\frametitle{Timing results (on my laptop)}
\begin{Schunk}
\begin{Sinput}
> library(biganalytics)
> flights14$time <- flights14$hour*60 + flights14$min
> mymat <- flights14[,c("air_time", "distance", "time")]
> mymat <- scale(mymat)
> mymat.df <- as.data.frame(mymat)
> mymat.bm <- as.big.matrix(mymat)
> mymat.bm2 <- as.big.matrix(mymat, backingfile = "bmTMP")
\end{Sinput}
\end{Schunk} \pause
\begin{Schunk}
\begin{Sinput}
system.time(huhn <- kmeans(mymat, centers = 6))
## 0.366
\end{Sinput}
\end{Schunk} \pause
\begin{Schunk}
\begin{Sinput}
system.time(huhn <- kmeans(mymat.df, centers = 6))
## 0.272
\end{Sinput}
\end{Schunk} \pause
\begin{Schunk}
\begin{Sinput}
system.time(huhn <- bigkmeans(mymat.bm, centers = 6))
## 0.187
\end{Sinput}
\end{Schunk} \pause
\begin{Schunk}
\begin{Sinput}
system.time(huhn <- bigkmeans(mymat.bm2, centers = 6))
## 0.295
\end{Sinput}
\end{Schunk}
\end{frame}
\begin{frame}[fragile,containsverbatim]
\frametitle{Function \code{biglm}}
\begin{itemize}
\item Compare \code{biglm} and \code{lm} regression models in terms
of time and memory use: fit, \emph{e.g.}, \code{arr\_delay} as a
function of \code{distance}.
\item Add a second explanatory variable using \code{update}. Again,
compare memory usage in \code{lm} and \code{biglm}.
\item Suppose \code{flights14} is too big for your computer. Fit a
\code{biglm} model on the first half, and then update your model
with the second half. Compare the results with the one-pass model.
\end{itemize}
\end{frame}
\begin{frame}[fragile,containsverbatim]
\frametitle{\code{biglm} results}
\begin{Schunk}\only<2>{\tiny}
\begin{Sinput}
library(profmem)
total(profmem(lmmod1 <- lm(arr_delay ~ distance,
data = flights14.df)))
## 52761760
total(profmem(lmmod2 <- biglm(arr_delay ~ distance,
data = flights14.df)))
## 46682072
total(profmem(lmmod3 <- biglm.big.matrix(arr_delay ~ distance,
data = flights14.bm)))
## 80718748
\end{Sinput}
\end{Schunk} \pause
\begin{Schunk}\scriptsize
\begin{Sinput}
N <- 150000
lmmod4 <- biglm(arr_delay ~ distance,
data = flights14.df[1:N,])
lmmod4 <- update(lmmod4,
moredata = flights14.df[(N+1):nrow(flights14.df),])
coef(lmmod2)
(Intercept) distance
9.703689724 -0.001416157
coef(lmmod4)
(Intercept) distance
9.703689724 -0.001416157
\end{Sinput}
\end{Schunk}
\end{frame}
\begin{frame}[fragile,containsverbatim]
\frametitle{Caveats and remarks}
\begin{itemize}[<+->]
......