In my experience, when you're doing a data analysis, the biggest bottleneck is cognitive.
Hadley Wickham
# Install R on macOS with Homebrew. Newer Homebrew releases dropped the
# separate `brew cask` subcommand; casks are installed with `--cask` instead.
brew install --cask r
# Two equal-length vectors: one numeric, one character.
# Each assignment must be on its own line (or separated by `;`) to parse.
a <- c(1, 2, 3, 4)
b <- c("a", "b", "c", "d")
# Build two equal-length vectors and combine them column-wise into a
# data frame with columns `var1` (numeric) and `var2` (text).
a <- c(1, 2, 3, 4)
b <- c("a", "b", "c", "d")
data.frame(var1 = a, var2 = b)
##   var1 var2
## 1    1    a
## 2    2    b
## 3    3    c
## 4    4    d
# Round a single number down to the nearest whole number.
floor(1.5)
## [1] 1
# floor() is vectorized: given a vector it rounds every element down.
floor(c(1.5, 3.9))
## [1] 1 3
#' Add five to a number
#'
#' @param x A numeric vector.
#' @return `x + 5`, the same length as `x`.
addFive <- function(x) {
  x + 5
}

# Example call; the definition and the call must be separate statements.
addFive(10)
## [1] 15
time,client_conn,client_req,cache_hit
1424123710,46,25,0
1424123711,47,52,2
1424123712,48,62,2
1424123713,49,63,2
1424123714,49,65,2
# Load the varnish statistics from CSV into a data frame.
varnish_df <- read.csv("varnish-stat.csv")

# Line chart of client connections over time. Passing the data frame via
# `data =` keeps the formula short and labels the axes with the bare
# column names instead of "varnish_df$...".
plot(client_conn ~ time, data = varnish_df, type = "l")
127.0.0.1 - - [12/Oct/2013:13:41:57 -0600] "GET /core/modules/shortcut/css/shortcut.icons.css HTTP/1.1" 200 499
127.0.0.1 - - [12/Oct/2013:21:25:25 -0500] "GET / HTTP/1.1" 200 9385
# Parse a web server access log into a data frame with webreadr, then
# inspect the resulting column names and types.
library("webreadr")
log_df <- read_clf("access.log")
str(log_df)
## Classes 'tbl_df', 'tbl' and 'data.frame': 3 obs. of 7 variables:
##  $ ip_address       : chr "127.0.0.1" "127.0.0.1" ""
##  $ remote_user_ident: chr NA NA NA
##  $ local_user_ident : chr NA NA NA
##  $ timestamp        : POSIXct, format: "2013-10-12 19:41:57" "2013-10-13 02:25:25" ...
##  $ request          : chr "GET /core/modules/shortcut/css/shortcut.icons.css HTTP/1.1" "GET / HTTP/1.1" NA
##  $ status_code      : int 200 200 NA
##  $ bytes_sent       : int 499 9385 NA
##  - attr(*, "problems")=Classes 'tbl_df', 'tbl' and 'data.frame': 1 obs. of 4 variables:
##   ..$ row     : int 3
##   ..$ col     : chr NA
##   ..$ expected: chr "7 columns"
##   ..$ actual  : chr "1 columns"
# Combine five numbers into a single numeric vector.
c(1, 2, 3, 4, 5)
## [1] 1 2 3 4 5
# Vector of five timestamps, one hour apart, starting at midnight
# (local time zone) on 2012-05-21.
seq(
  from = as.POSIXct("2012-05-21"),
  by = "+1 hour",
  length.out = 5
)
## [1] "2012-05-21 00:00:00 CEST" "2012-05-21 01:00:00 CEST"
## [3] "2012-05-21 02:00:00 CEST" "2012-05-21 03:00:00 CEST"
## [5] "2012-05-21 04:00:00 CEST"
# Vectors can hold text too: a character vector of four strings.
c("A", "B", "C", "D")
## [1] "A" "B" "C" "D"
# Preview the first rows of the requests data frame.
head(requests_df)
##               Agent Total.Time Success Status            datetime
## 1 Sydney, Australia       4.66       1     OK 2013-04-01 20:18:56
## 2      San Jose, CA       4.59       1     OK 2013-03-27 03:49:00
## 3   London, England       4.02       1     OK 2013-03-23 09:43:55
## 4 Sydney, Australia       4.78       1     OK 2013-03-30 00:23:54
## 5 Sydney, Australia       4.58       1     OK 2013-03-27 14:43:54
## 6 Sydney, Australia       5.12       1     OK 2013-03-29 01:33:56
# Distribution of Total.Time: histogram with 0.2-wide bins, plus a rug
# along the axis marking each individual observation.
requests_df %>%
  ggplot(aes(x = Total.Time)) +
  geom_histogram(binwidth = 0.2) +
  geom_rug()
# Total.Time over time, drawn as one connected line.
ggplot(
  data = requests_df,
  mapping = aes(x = datetime, y = Total.Time)
) +
  geom_line()
# Total.Time over time as a scatter plot; the points are semi-transparent
# so that dense regions show up darker.
ggplot(
  data = requests_df,
  mapping = aes(x = datetime, y = Total.Time)
) +
  geom_point(alpha = 0.2)
# One box plot of Total.Time per Agent, coloured by Agent. The legend is
# hidden because the x axis already names each Agent.
ggplot(
  data = requests_df,
  mapping = aes(x = Agent, y = Total.Time, colour = Agent)
) +
  geom_boxplot() +
  theme(legend.position = "none")
# Total.Time over time with one coloured line per Agent, all in a single
# panel; the lines are semi-transparent so overlapping series stay visible.
ggplot(
  data = requests_df,
  mapping = aes(x = datetime, y = Total.Time, colour = Agent)
) +
  geom_line(alpha = 0.44)
# Total.Time over time, split into one panel per Agent.
# The transparency is a fixed value, so it is set on the geom rather than
# inside aes(): inside aes() a constant is treated as data and run through
# the alpha scale, so the lines would not actually be drawn at 0.4.
ggplot(
  data = requests_df,
  mapping = aes(x = datetime, y = Total.Time, colour = Agent)
) +
  geom_line(alpha = 0.4) +
  facet_wrap(~Agent) +
  theme(legend.position = "none")
# Load tidyr for reshaping, then look at the wide-format varnish data
# (one column per metric) before it is converted to long format.
library("tidyr")
head(varnish_df)
##         time client_conn client_req cache_hit
## 1 1424123710          46         25         0
## 2 1424123711          47         52         2
## 3 1424123712          48         62         2
## 4 1424123713          49         63         2
## 5 1424123714          49         65         2
# Reshape from wide to long: every column except `time` is stacked into a
# `metric` (former column name) / `value` pair.
# NOTE(review): tidyr now marks gather() as superseded by pivot_longer();
# gather() is kept here so the row order matches the printed output.
df_long <- gather(varnish_df, metric, value, -time)
head(df_long)
##         time      metric value
## 1 1424123710 client_conn    46
## 2 1424123711 client_conn    47
## 3 1424123712 client_conn    48
## 4 1424123713 client_conn    49
## 5 1424123714 client_conn    49
## 6 1424123710  client_req    25
# With the data in long format, one line per metric comes from a single
# colour mapping.
ggplot(
  data = df_long,
  mapping = aes(x = time, y = value, colour = metric)
) +
  geom_line()
# Show the first two rows of the data frame whose values carry a unit suffix.
head(df_with_ms, 2)
##         time      metric value
## 1 1424123710 client_conn  46ms
## 2 1424123711 client_conn  47ms
# Split the `value` column into two columns using regex capture groups:
# the leading digits go to `value`, whatever follows goes to `unit`.
df_extracted <- extract(
  df_with_ms,
  value,
  c("value", "unit"),
  "([0-9]+)(.*)"
)
head(df_extracted, 2)
##         time      metric value unit
## 1 1424123710 client_conn    46   ms
## 2 1424123711 client_conn    47   ms
# Nested-call form, read inside-out: keep the rows with Status "OK",
# keep only the Total.Time column, then show the first rows.
head(
  select(
    filter(requests_df, Status == "OK"),
    Total.Time
  )
)
# == (the nested call above is equivalent to the pipeline below)
# Pipeline form, read top to bottom: keep the rows with Status "OK",
# keep only the Total.Time column, then show the first rows.
requests_df %>%
  filter(Status == "OK") %>%
  select(Total.Time) %>%
  head()
# Total.Time for the "New York, NY" agent only, restricted to timestamps
# strictly between 2013-03-24 and 2013-04-02, drawn as a line.
requests_df %>%
  filter(Agent == "New York, NY") %>%
  filter(
    datetime > as.POSIXct("2013-03-24"),
    datetime < as.POSIXct("2013-04-02")
  ) %>%
  ggplot(aes(x = datetime, y = Total.Time)) +
  geom_line()
# Per-agent latency summary: median plus the 95th, 99th and 99.9th
# percentiles of Total.Time. Rows with a missing Total.Time are dropped
# first so that median() and quantile() do not hit NA values.
requests_df %>%
  filter(!is.na(Total.Time)) %>%
  group_by(Agent) %>%
  summarise(
    median = median(Total.Time),
    `95th` = quantile(Total.Time, probs = 0.95),
    `99th` = quantile(Total.Time, probs = 0.99),
    `99.9th` = quantile(Total.Time, probs = 0.999)
  )
## Source: local data frame [6 x 5]
##
##                     Agent median   95th    99th   99.9th
##                    (fctr)  (dbl)  (dbl)   (dbl)    (dbl)
## 1 Buenos Aires, Argentina   4.69 6.7700 13.6552 27.68720
## 2               Hong Kong   4.00 6.7430  9.5541 19.66839
## 3         London, England   4.43 5.7100  6.2085  7.61725
## 4            New York, NY   3.54 4.8200  5.4740 11.42760
## 5            San Jose, CA   4.30 5.6985  6.9082  8.72256
## 6       Sydney, Australia   4.75 6.5420  7.8440  9.52650
# Add a derived column holding Total.Time multiplied by 1000, then plot
# its histogram with 100-wide bins.
requests_df %>%
  mutate(Total.Time.ms = Total.Time * 1000) %>%
  ggplot(aes(x = Total.Time.ms)) +
  geom_histogram(binwidth = 100)
# Summary of the request_time_ms column: min, quartiles, median, mean, max.
summary(multimode_df$request_time_ms)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##   7.334   9.299  12.320  25.560  39.440 147.500
Exploratory Data Analysis with R - Roger Peng
The Art of Data Science - Roger Peng