Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Warris, Sven
r-big-data
Commits
501d010b
Commit
501d010b
authored
Sep 29, 2017
by
Ron Wehrens
Browse files
final version of lecture 2
parent
90ab6764
Changes
2
Hide whitespace changes
Inline
Side-by-side
slides/Ron/lecture2.pdf
View file @
501d010b
No preview for this file type
slides/Ron/lecture2.tex
View file @
501d010b
...
...
@@ -472,7 +472,6 @@ welchTest <- function(mns, vrs, ns) {
\includegraphics
[width=.6\textwidth]
{
takefive.jpg
}}
\end{frame}
\begin{frame}
[fragile,containsverbatim]
\frametitle
{
The
\code
{
bigmemory
}
package
}
\begin{columns}
[onlytextwidth]
...
...
@@ -551,12 +550,13 @@ welchTest <- function(mns, vrs, ns) {
\frametitle
{
Analysis for big data sets
}
\begin{columns}
[onlytextwidth]
\begin{column}
<+->[t]
{
.45
\textwidth
}
The
\code
{
biglm
}
package:
The
\code
{
biglm
}
package
(Lumley)
:
\begin{itemize}
[<+->]
\item
create a linear model using only
$
O
(
p
^
2
)
$
memory for
$
p
$
variables
\item
add more data using
\code
{
update
}
\item
in this way data sets larger than memory can be handled!
\item
\code
{
bigmemory
}
presents a variant for
\code
{
big.matrix
}
objects
\end{itemize}
\end{column}
\begin{column}
<+->[t]
{
.45
\textwidth
}
...
...
@@ -566,7 +566,7 @@ welchTest <- function(mns, vrs, ns) {
\item
no memory overhead: just the data matrix and the class
labels
\item
classical
\code
{
kmeans
}
at least two extra copies
\item
also works with ``normal'' matrices
\item
also works with ``normal'' matrices
\end{itemize}
\end{column}
\end{columns}
...
...
@@ -594,6 +594,50 @@ welchTest <- function(mns, vrs, ns) {
\end{itemize}
\end{frame}
\begin{frame}
[fragile,containsverbatim]
\frametitle
{
Results (on my laptop)
}
\begin{Schunk}
\tiny
\begin{Sinput}
## select only flights from JFK between Feb 1 and March 31
flights14.df <- as.data.frame(flights14)
system.time(for (i in 1:10)
huhn <- flights14.df[flights14.df
$
month
%in% 2:3 &
flights
14
.df
$
origin == "JFK",])
## 0.088
\end{Sinput}
\end{Schunk}
\pause
\begin{Schunk}
\tiny
\begin{Sinput}
system.time(for (i in 1:100)
huhn <- flights14[month
%in% 2:3 & origin == "JFK",])
## 0.077
\end{Sinput}
\end{Schunk}
\pause
\begin{Schunk}
\tiny
\begin{Sinput}
flights14.bm <- as.big.matrix(flights14)
## NOTE: all non-numerical columns are converted to factors and then to integers
## You CANNOT use "JFK" to select on origin; use its integer code instead
system.time(for (i in 1:10)
huhn <- flights14.bm[flights14.bm[,"month"]
%in% 2:3 &
flights14.bm[,"origin"] == 2,])
## 1.038
\end{Sinput}
\end{Schunk}
\pause
\begin{Schunk}
\tiny
\begin{Sinput}
flights14.bm2 <- as.big.matrix(flights14, backingfile = "fl14TMP")
system.time(for (i in 1:10)
huhn <- flights14.bm2[flights14.bm2[,"month"]
%in% 2:3 &
flights14.bm2[,"origin"] == 2,])
## 1.172
\end{Sinput}
\end{Schunk}
\pause
Look at
\code
{
?mwhich
}
for fast selection
\end{frame}
\begin{frame}
[fragile,containsverbatim]
\frametitle
{
Big K Means
}
\begin{itemize}
...
...
@@ -601,23 +645,105 @@ welchTest <- function(mns, vrs, ns) {
flight data, using the
\code
{
bigkmeans
}
function. Choose any
$
k
$
that you like...
\item
Compare the results with the results of the regular
\code
{
kmeans
}
function
(from the MASS package)
.
\code
{
kmeans
}
function.
\item
Are the clusters related to,
\emph
{
e.g.
}
, season? Is the clustering meaningful, anyway?
\end{itemize}
\end{frame}
\begin{frame}
[fragile,containsverbatim]
\frametitle
{
Big lm
}
\frametitle
{
Timing results (on my laptop)
}
\begin{Schunk}
\begin{Sinput}
> library(biganalytics)
> flights14
$
time <
-
flights
14
$
hour*60 + flights14
$
min
> mymat <
-
flights
14
[
,c
(
"air
_
time", "distance", "time"
)]
> mymat <
-
scale
(
mymat
)
> mymat.df <
-
as.data.frame
(
mymat
)
> mymat.bm <
-
as.big.matrix
(
mymat
)
> mymat.bm
2
<
-
as.big.matrix
(
mymat, backingfile
=
"bmTMP"
)
\end
{
Sinput
}
\begin
{
Schunk
}
\pause
\begin
{
Sinput
}
system.time
(
huhn <
-
kmeans
(
mymat, centers
=
6
))
##
0
.
366
\end
{
Sinput
}
\end
{
Schunk
}
\pause
\begin
{
Schunk
}
\begin
{
Sinput
}
system.time
(
huhn <
-
kmeans
(
mymat.df, centers
=
6
))
##
0
.
272
\end
{
Sinput
}
\end
{
Schunk
}
\pause
\begin
{
Schunk
}
\begin
{
Sinput
}
system.time
(
huhn <
-
bigkmeans
(
mymat.bm, centers
=
6
))
##
0
.
187
\end
{
Sinput
}
\end
{
Schunk
}
\pause
\begin
{
Schunk
}
\begin
{
Sinput
}
system.time
(
huhn <
-
bigkmeans
(
mymat.bm
2
, centers
=
6
))
##
0
.
295
\end
{
Sinput
}
\end
{
Schunk
}
\end
{
Schunk
}
\end
{
frame
}
\begin
{
frame
}
[
fragile,containsverbatim
]
\frametitle
{
Function biglm
}
\begin
{
itemize
}
\item
Compare
\code
{
biglm
}
and
\code
{
lm
}
regression models in terms
of time and memory use: fit,
\emph
{
e.g.
}
,
\code
{
arr
\_
delay
}
as a
function of
\code
{
distance
}
.
\item
Add a second explanatory variable using
\code
{
update
}
. Again,
compare memory usage in
\code
{
lm
}
and
\code
{
biglm
}
.
\item
Suppose
\code
{
flights
14
}
is too big for your computer. Fit a
\code
{
biglm
}
model on the first half, and then update your model
with the second half. Compare the results with the one
-
pass model.
\end
{
itemize
}
\end
{
frame
}
\begin
{
frame
}
[
fragile,containsverbatim
]
\frametitle
{
Biglm results
}
\begin
{
Schunk
}
\only
<
2
>
{
\tiny
}
\begin
{
Sinput
}
library
(
profmem
)
total
(
profmem
(
lmmod
1
<
-
lm
(
arr
_
delay ~ distance,
data
=
flights
14
.df
)))
##
52761760
total
(
profmem
(
lmmod
2
<
-
biglm
(
arr
_
delay ~ distance,
data
=
flights
14
.df
)))
##
46682072
total
(
profmem
(
lmmod
3
<
-
biglm.big.matrix
(
arr
_
delay ~ distance,
data
=
flights
14
.bm
)))
##
80718748
\end
{
Sinput
}
\end
{
Schunk
}
\pause
\begin
{
Schunk
}
\scriptsize
\begin
{
Sinput
}
N <
-
150000
lmmod
4
<
-
biglm
(
arr
_
delay ~ distance,
data
=
flights
14
.df
[
1
:N,
])
lmmod
4
<
-
update
(
lmmod
4
,
moredata
=
flights
14
.df
[(
N
+
1
)
:nrow
(
flights
14
.df
)
,
])
coef
(
lmmod
2
)
(
Intercept
)
distance
9
.
703689724
-
0
.
001416157
coef
(
lmmod
4
)
(
Intercept
)
distance
9
.
703689724
-
0
.
001416157
\end
{
Sinput
}
\end
{
Schunk
}
\end
{
frame
}
\begin
{
frame
}
[
fragile,containsverbatim
]
\frametitle
{
Caveats and remarks
}
\begin
{
itemize
}
[
<
+-
>
]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment