Challenge 1
Read new_diamonds dataset and see compute time differences when specifying columns while using read.csv(). Do this analysis using profvis.
Solution
library(profvis)
# newDiamonds=rbind(diamonds, diamonds,diamonds, diamonds)
# newDiamonds2 = newDiamonds %>% select(carat,cut, price)
# write.csv2(newDiamonds2,file="new_diamonds.csv",row.names=FALSE)
profvis({
new_diamonds = read.csv2("datasets/new_diamonds.csv")
new_diamonds2 = read.csv2("datasets/new_diamonds.csv", colClasses = c("numeric",
"character","numeric"))
})
Challenge 2
Verify the differences in compute time when calculating means of the numeric values of new_diamonds
.
- Subset the relevant columns
new_diamonds_num = new_diamonds = new_diamonds[,c(1,3)]
- Use microbenchmark on
colMeans()
and apply
Solution
library(microbenchmark)
diamonds_num = new_diamonds[,c(1,3)]
microbenchmark(
base::colMeans(diamonds_num),
apply(diamonds_num,2,mean)
)
Unit: milliseconds
Challenge 3
Verify the compute time differences between:
- Preallocating the memory and create a new vector with
newvector=1:1000
- Create the same vector by generating a loop that grows the vector size with
newvector=c(newvector,x)
Solution
create_vector =function(n){
vec = c()
for(i in 1:n){
vec=c(vec,i)
}
}
microbenchmark(
newvector = c(1:1000),
newvector2 = create_vector(1000)
)
Unit: microseconds
Challenge 4
What does the lapply2 function do? Check compute time difference between lapply2
and lapply2_c
. - To do this you could appley the function sd
to x=list(1:200,1:100,100:340,10:90000)
Solution
lapply2
applies a function to each elelement of a list.
lapply2 <- function(x, f, ...) {
out <- vector("list", length(x))
for (i in seq_along(x)) {
out[[i]] <- f(x[[i]], ...)
}
out
}
lapply2_c <- compiler::cmpfun(lapply2)
x=list(1:200,1:100,100:340,10:90000)
microbenchmark(
lapply2(x,sd),
lapply2_c(x,sd),
lapply(x,sd)
)
Unit: microseconds
Challenge 5
Modify the add
function to return multiplication for 3 integers and check compute time differences with standard R operator. Do you expect a difference?
Can you extend the function to multiplication of 3 number in general?
Solution
library(Rcpp)
cppFunction('int mult(int x, int y) {
int prod = x * y;
return prod;
}')
x=17635
y=10002
microbenchmark(
a=x*y,
b=mult(x,y)
)
Unit: nanoseconds
Challenge 6 (Advanced)
Sample 20 passengers from the titanic and calculate the mean_age. Do this 10000 times (in parallel).
Use the titanic DataSet: titanic <- read.csv("https://goo.gl/4Gqsnz")
- Does the code run faster if run in parallel?
Tip: Remove NA’s using na.omit(titanic)
Solution
library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
library(parallel)
titanic = read.csv("https://goo.gl/4Gqsnz")
titanic= na.omit(titanic)
sample_mean =function(i){
mean_Age = titanic %>%
sample_n(20) %>%
summarise(mean_age=mean(Age)) %>%
pull(mean_age)
return(mean_Age)
}
system.time(lapply(1:10000, sample_mean))
user system elapsed
16.005 0.163 17.155
system.time(mclapply(1:10000, sample_mean))
user system elapsed
8.704 0.261 9.362
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKYGBge3Igc2V0dXAsIGluY2x1ZGU9RkFMU0V9CmtuaXRyOjpvcHRzX2NodW5rJHNldChlY2hvID0gVFJVRSkKYGBgCgojIENoYWxsZW5nZSAxCgoqKlJlYWQgbmV3X2RpYW1vbmRzIGRhdGFzZXQgYW5kIHNlZSBjb21wdXRlIHRpbWUgZGlmZmVyZW5jZXMgd2hlbiBzcGVjaWZ5aW5nIGNvbHVtbnMgd2hpbGUgdXNpbmcgcmVhZC5jc3YoKS4gRG8gdGhpcyBhbmFseXNpcyB1c2luZyBwcm9mdmlzLioqCgojIyMgU29sdXRpb24KYGBge3J9CmxpYnJhcnkocHJvZnZpcykKIyBuZXdEaWFtb25kcz1yYmluZChkaWFtb25kcywgZGlhbW9uZHMsZGlhbW9uZHMsIGRpYW1vbmRzKQojIG5ld0RpYW1vbmRzMiA9IG5ld0RpYW1vbmRzICU+JSBzZWxlY3QoY2FyYXQsY3V0LCBwcmljZSkKIyB3cml0ZS5jc3YyKG5ld0RpYW1vbmRzMixmaWxlPSJuZXdfZGlhbW9uZHMuY3N2Iixyb3cubmFtZXM9RkFMU0UpCgpwcm9mdmlzKHsKbmV3X2RpYW1vbmRzID0gcmVhZC5jc3YyKCJkYXRhc2V0cy9uZXdfZGlhbW9uZHMuY3N2IikKCm5ld19kaWFtb25kczIgPSByZWFkLmNzdjIoImRhdGFzZXRzL25ld19kaWFtb25kcy5jc3YiLCBjb2xDbGFzc2VzID0gYygibnVtZXJpYyIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAiY2hhcmFjdGVyIiwibnVtZXJpYyIpKQp9KQoKYGBgCgojIENoYWxsZW5nZSAyCgpWZXJpZnkgdGhlIGRpZmZlcmVuY2VzIGluIGNvbXB1dGUgdGltZSB3aGVuIGNhbGN1bGF0aW5nIG1lYW5zIG9mIHRoZSBudW1lcmljIHZhbHVlcyBvZiBgbmV3X2RpYW1vbmRzYC4KCi0gU3Vic2V0IHRoZSByZWxldmFudCBjb2x1bW5zIGBuZXdfZGlhbW9uZHNfbnVtID0gbmV3X2RpYW1vbmRzID0gbmV3X2RpYW1vbmRzWyxjKDEsMyldYAotIFVzZSBtaWNyb2JlbmNobWFyayBvbiBgY29sTWVhbnMoKWAgYW5kICBgYXBwbHlgCgojIyMgU29sdXRpb24KYGBge3J9CgpsaWJyYXJ5KG1pY3JvYmVuY2htYXJrKQoKZGlhbW9uZHNfbnVtID0gbmV3X2RpYW1vbmRzWyxjKDEsMyldCgptaWNyb2JlbmNobWFyaygKICBiYXNlOjpjb2xNZWFucyhkaWFtb25kc19udW0pLAogIGFwcGx5KGRpYW1vbmRzX251bSwyLG1lYW4pCikKCgpgYGAKCiMgQ2hhbGxlbmdlIDMKCioqVmVyaWZ5IHRoZSBjb21wdXRlIHRpbWUgZGlmZmVyZW5jZXMgYmV0d2VlbjoqKgoKMi4gUHJlYWxsb2NhdGluZyB0aGUgbWVtb3J5IGFuZCBjcmVhdGUgYSBuZXcgdmVjdG9yIHdpdGggYG5ld3ZlY3Rvcj0xOjEwMDBgCjEuIENyZWF0ZSB0aGUgc2FtZSB2ZWN0b3IgYnkgZ2VuZXJhdGluZyBhIGxvb3AgdGhhdCBncm93cyB0aGUgdmVjdG9yIHNpemUgd2l0aCBgbmV3dmVjdG9yPWMobmV3dmVjdG9yLHgpYAoKIyMjIFNvbHV0aW9uCmBgYHtyfQoKY3JlYXRlX3ZlY3RvciA9ZnVuY3Rpb24obil7CiAgdmVjID0gYygpCiAgZm9yKGkgaW4gMTpuKXsKICAgIHZlYz1jKHZlYyxpKQogIH0KfQoKbWljcm9iZW5jaG1hcmsoCiAgbmV3dmVjdG9yID0gYygxOjEwMDApLAogIG5ld3ZlY3RvcjIgPSBjcmVhdGVfdmVjdG9yKDEwMDApCikKYGBgCgoKIyBDaGFsbGVuZ2UgNAoKKipXaGF0IGRvZXMgdGhlIGxhcHBseTIgZnVuY3Rpb24gZG8/IENoZWNrIGNvbXB1dGUgdGltZSBkaWZmZXJlbmNlIGJldHdlZW4gYGxhcHBseTJgIGFuZCBgbGFwcGx5Ml9jYC4qKgotIFRvIGRvIHRoaXMgeW91IGNvdWxkIGFwcGxleSB0aGUgZnVuY3Rpb24gYHNkYCB0byBgeD1saXN0KDE6MjAwLDE6MTAwLDEwMDozNDAsMTA6OTAwMDApYAoKIyMjIFNvbHV0aW9uCgpgbGFwcGx5MmAgYXBwbGllcyBhIGZ1bmN0aW9uIHRvIGVhY2ggZWxlbGVtZW50IG9mIGEgbGlzdC4KYGBge3IgZXZhbD1UUlVFfQpsYXBwbHkyIDwtIGZ1bmN0aW9uKHgsIGYsIC4uLikgewogIG91dCA8LSB2ZWN0b3IoImxpc3QiLCBsZW5ndGgoeCkpCiAgZm9yIChpIGluIHNlcV9hbG9uZyh4KSkgewogICAgb3V0W1tpXV0gPC0gZih4W1tpXV0sIC4uLikKICB9CiAgb3V0Cn0KCmxhcHBseTJfYyA8LSBjb21waWxlcjo6Y21wZnVuKGxhcHBseTIpCgp4PWxpc3QoMToyMDAsMToxMDAsMTAwOjM0MCwxMDo5MDAwMCkKbWljcm9iZW5jaG1hcmsoCiAgbGFwcGx5Mih4LHNkKSwKICBsYXBwbHkyX2MoeCxzZCksCiAgbGFwcGx5KHgsc2QpCikKCmBgYAoKIyBDaGFsbGVuZ2UgNQoKTW9kaWZ5IHRoZSBgYWRkYCBmdW5jdGlvbiB0byByZXR1cm4gbXVsdGlwbGljYXRpb24gZm9yIDMgaW50ZWdlcnMgYW5kIGNoZWNrIGNvbXB1dGUgdGltZSBkaWZmZXJlbmNlcyB3aXRoIHN0YW5kYXJkIFIgb3BlcmF0b3IuIERvIHlvdSBleHBlY3QgYSBkaWZmZXJlbmNlPwoKQ2FuIHlvdSBleHRlbmQgdGhlIGZ1bmN0aW9uIHRvIG11bHRpcGxpY2F0aW9uIG9mIDMgbnVtYmVyIGluIGdlbmVyYWw/CgojIyMgU29sdXRpb24KYGBge3J9CmxpYnJhcnkoUmNwcCkKCmNwcEZ1bmN0aW9uKCdpbnQgbXVsdChpbnQgeCwgaW50IHkpIHsKICBpbnQgcHJvZCA9IHggKiB5OwogIHJldHVybiBwcm9kOwp9JykKCgp4PTE3NjM1Cnk9MTAwMDIKCm1pY3JvYmVuY2htYXJrKAogIGE9eCp5LAogIGI9bXVsdCh4LHkpCikKCmBgYAoKCiMgQ2hhbGxlbmdlIDYgKEFkdmFuY2VkKQoKU2FtcGxlIDIwIHBhc3NlbmdlcnMgZnJvbSB0aGUgdGl0YW5pYyBhbmQgY2FsY3VsYXRlIHRoZSBtZWFuX2FnZS4gRG8gdGhpcyAxMDAwMCB0aW1lcyAoaW4gcGFyYWxsZWwpLgoKVXNlIHRoZSB0aXRhbmljIERhdGFTZXQ6IGB0aXRhbmljIDwtIHJlYWQuY3N2KCJodHRwczovL2dvby5nbC80R3FzbnoiKWAKCi0gRG9lcyB0aGUgY29kZSBydW4gZmFzdGVyIGlmIHJ1biBpbiBwYXJhbGxlbD8KCipUaXA6IFJlbW92ZSBOQSdzIHVzaW5nIGBuYS5vbWl0KHRpdGFuaWMpYCoKCiMjIyBTb2x1dGlvbgpgYGB7cn0KCmxpYnJhcnkoZHBseXIpCmxpYnJhcnkocGFyYWxsZWwpCnRpdGFuaWMgPSByZWFkLmNzdigiaHR0cHM6Ly9nb28uZ2wvNEdxc256IikKdGl0YW5pYz0gbmEub21pdCh0aXRhbmljKQoKc2FtcGxlX21lYW4gPWZ1bmN0aW9uKGkpewogIG1lYW5fQWdlID0gdGl0YW5pYyAlPiUgCiAgICBzYW1wbGVfbigyMCkgJT4lIAogICAgc3VtbWFyaXNlKG1lYW5fYWdlPW1lYW4oQWdlKSkgJT4lIAogICAgcHVsbChtZWFuX2FnZSkKICByZXR1cm4obWVhbl9BZ2UpCn0KCnN5c3RlbS50aW1lKGxhcHBseSgxOjEwMDAwLCBzYW1wbGVfbWVhbikpCgpzeXN0ZW0udGltZShtY2xhcHBseSgxOjEwMDAwLCBzYW1wbGVfbWVhbikpCgpgYGAKCgoKCg==