diff --git a/.gitignore b/.gitignore
index 508df78..bbe5c9e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 .Rproj.user
 .Rhistory
+README.html
diff --git a/README.md b/README.md
index 28a53de..b31ce2b 100644
--- a/README.md
+++ b/README.md
@@ -12,17 +12,17 @@ Using the [stackexchange data explorer](http://data.stackexchange.com/stackoverf
 
 A little further exploration (not shown) indicates that this is very close to being exponential growth.  
 
-If we standardise the number of R questions by the number of python questions, we see that the number of R questions is increasing more rapidly than python. Currently, about 1 question about R is asked for every four questions asked about python.
+If we standardise the number of R questions by the number of python questions, we see that the number of R questions is increasing more rapidly than python. Currently, about 1 question about R is asked for every three questions asked about python.
 
 ![R questions growing relative to python](images/so-rel.png) 
 
 ## Github repos
 
-Again we see exponential growth in both repos containing R code and repos containing python code (these number don't include forks), but R repo's are relatively less common than R questions.
+Again we see exponential growth in both repos containing R code and repos containing python code (these numbers don't include forks), but R repos are relatively less common than R questions. The big jump in repo creation in 2014 is probably due to the [JHU coursera course](https://www.coursera.org/course/datascitoolbox).
 
 ![Explosive growth of R and python repos over time](images/github-raw.png)
 
-If we standardise the number of R repos by the number of python repos, we see that R is (slowly) catching up, but it's a long way behind.
+If we standardise the number of R repos by the number of python repos, we see that the relative number of R repos has been decreasing since the big jump in 2014.
 
 ![R repos growing relative to python repos](images/github-rel.png) 
 
@@ -41,3 +41,7 @@ This is the data of monthly downloads made available from the [Python PyPi Packa
 * Look at use of mailing lists. Is there a pydata specific mailing list?
 * Compare twitter hashtags: rstats, python, pydata?
 * Compare package downloads?
+* Number of Kaggle solution scripts written in R versus Python.
+* Number of Machine Learning courses on MOOC sites that use R versus Python.
+* Compare attendees at big R versus big Python data conferences year-over-year.
+
diff --git a/github-1-get.R b/github-1-get.R
index d795607..0604c36 100644
--- a/github-1-get.R
+++ b/github-1-get.R
@@ -4,16 +4,15 @@ library(dplyr)
 
 # Github API -------------------------------------------------------------------
 
-user <- Sys.getenv("GITHUB_USER")
-pwd <- Sys.getenv("GITHUB_PASS")
-if (user == "" || pwd == "") {
-  stop("Set GITHUB_USER and GITHUB_PASS env vars", call. = FALSE)
+pat <- Sys.getenv("GITHUB_PAT")
+if (pat == "") {
+  stop("Set GITHUB_PAT env vars", call. = FALSE)
 }
 
 # install_github("rgithub", "cscheid")
 base <- "https://api.github.com"
 config <- c(
-  authenticate(user, pwd, type = "basic"), 
+  authenticate(pat, "", type = "basic"), 
   add_headers(
     Accept = "application/vnd.github.preview", 
     "User-Agent" = "hadley/r-python"
@@ -24,9 +23,9 @@ rate_limit <- function() {
 }
 
 repos <- function(query) {
-  Sys.sleep(60 / 20) # 20 requests per minute
+  Sys.sleep(60 / 30) # 30 requests per minute
   path <- paste0(base, "/search/repositories")
-  qs <- list(q = paste("language:r", query), per_page = 1)
+  qs <- list(q = query, per_page = 1)
   
   req <- GET(path, config, query = qs)
   stop_for_status(req)
@@ -38,23 +37,34 @@ cur_year <- year(today())
 
 past <- expand.grid(year = 2011:(cur_year - 1), month = 1:12)
 pres <- data.frame(year = cur_year, month = 1:(month(today()) - 2))
-months <- rbind(past, pres)
-
-all <- rbind(cbind(months, lang = "r"), cbind(months, lang = "python"))
-all <- all %.% group_by(lang) %.% arrange(year, month)
-all <- all %.% mutate(
-  year_next = lead(year, default = cur_year), 
-  month_next = lead(month, default = month(today()) - 1)
-)
+months <- bind_rows(past, pres) %>% 
+  arrange(year, month) %>% 
+  mutate(
+    start = as.Date(ISOdate(year, month, 1)),
+    end = start + months(1) - days(1)
+  )
 
-all$query <- paste0("language:", all$lang, 
-  " created:", all$year, "-", all$month, "..", 
-  all$year_next, "-", all$month_next)
-all$count <- NA
+all <- bind_rows(
+    months %>% mutate(lang = "r"),
+    months %>% mutate(lang = "python")
+  ) %>% 
+  mutate(
+    query = paste0("language:", lang,  " created:", start, "..", end),
+    count = NA_integer_
+  )
+# 
+# if (file.exists("github.rds")) {
+#   cached <- readRDS("github.rds") %>% ungroup() %>% mutate(lang = as.character(lang))
+#   
+#   all <- bind_rows(
+#     cached,
+#     all %>% anti_join(cached, c("year", "month", "lang"))
+#   )
+# }
 
 missing <- seq_along(all$count)[is.na(all$count)]
-for(i in missing) {
-  all$count[[i]] <- repos(all$query[i])
+for (i in missing) {
   cat(".")
+  all$count[[i]] <- repos(all$query[i])
 }
 saveRDS(all, "github.rds")
diff --git a/github-2-explore.R b/github-2-explore.R
index 11f5dfe..ae965fc 100644
--- a/github-2-explore.R
+++ b/github-2-explore.R
@@ -1,22 +1,23 @@
 library(ggplot2)
 library(dplyr)
 
-all <- ungroup(readRDS("github.rds"))
-all$query <- NULL
-all$date <- as.Date(ISOdate(all$year, all$month, 1))
-all$lang <- as.character(all$lang)
+gh <- readRDS("github.rds")
+gh$query <- NULL
 
-# For github repos, both R and python growing exponential, but python
-# is a long way ahead
-qplot(date, count, data = all, geom = "line", colour = lang)
-ggsave("images/github-raw.png", width = 8, height = 6)
+# For github repos, both R and python growing exponentially, but python
+# is a long way ahead. Big spike in R repo creation in early 2014 
+# probably due to JHU coursera course.
+ggplot(gh, aes(start, count)) + 
+  geom_line(aes(colour = lang)) +
+  scale_y_log10()
+ggsave("images/github-raw.png", width = 8, height = 6, dpi = 96)
 
-qplot(date, count, data = all, geom = "line", colour = lang) + scale_y_log10()
+rel <- gh %>% 
+  group_by(start) %>% 
+  mutate(rel = count / max(count)) %>%
+  filter(lang == "r")
 
-all_rel <- all %.% group_by(date) %.% 
-  mutate(rel = count / max(count))
-
-# Again R is catching up, but it's a lot further behind
-qplot(date, rel, data = all_rel %.% filter(lang == "r"), geom = "line") + 
-  ylab("R repos relative to python repos")
+# Steady decline in relative usage since 2014
+ggplot(rel, aes(start, rel)) + 
+  geom_line()
 ggsave("images/github-rel.png", width = 8, height = 6) 
diff --git a/github.rds b/github.rds
index ba6c1de..739f290 100644
Binary files a/github.rds and b/github.rds differ
diff --git a/images/github-raw.png b/images/github-raw.png
index c712c6b..b5488a4 100644
Binary files a/images/github-raw.png and b/images/github-raw.png differ
diff --git a/images/github-rel.png b/images/github-rel.png
index 51cea2c..3b51eba 100644
Binary files a/images/github-rel.png and b/images/github-rel.png differ
diff --git a/images/so-raw.png b/images/so-raw.png
index 731d122..26f0b68 100644
Binary files a/images/so-raw.png and b/images/so-raw.png differ
diff --git a/images/so-rel.png b/images/so-rel.png
index b0e183b..25f6c7f 100644
Binary files a/images/so-rel.png and b/images/so-rel.png differ
diff --git a/pydata_packages.yaml b/pydata_packages.yaml
new file mode 100644
index 0000000..ebb44df
--- /dev/null
+++ b/pydata_packages.yaml
@@ -0,0 +1,39 @@
+stackoverflow_tags:
+  - scikit-learn
+  - scikits-learn
+  - scikits
+  - scikit-image
+  - scipy
+  - numpy
+  - matplotlib
+  - matplotlib-basemap
+  - pandas
+  - networkx
+  - nltk
+  - theano
+  - h5py
+  - statsmodels
+  - ipython
+  - ipython-notebook
+  - ipython-magic
+  - ipython-parallel
+  - python-imaging-library
+  - pillow
+import_names:
+  - sklearn
+  - numpy
+  - PIL
+  - scipy
+  - matplotlib
+  - ipython
+  - pandas
+  - networkx
+  - numexpr
+  - tables
+  - patsy
+  - statsmodels
+  - sympy
+  - skimage
+  - h5py
+  - nltk
+  - theano
diff --git a/stackoverflow.R b/stackoverflow.R
index 96cb0f3..9fc8912 100644
--- a/stackoverflow.R
+++ b/stackoverflow.R
@@ -1,29 +1,28 @@
 library(ggplot2)
 library(dplyr)
+library(readr)
 
 # http://data.stackexchange.com/stackoverflow/query/150296/r-and-python-questions
-so <- read.csv("http://data.stackexchange.com/stackoverflow/csv/186078")
+so <- read_csv("http://data.stackexchange.com/stackoverflow/csv/186078")
 names(so) <- c("month", "tag", "count")
 so$month <- as.Date(so$month)
+so <- so %>% filter(month > as.Date("2010-01-01")) # too noisy before then
 
 # Explosive growth of both python and R tags
-qplot(month, count, data = so2, geom = "line", colour = tag)
-ggsave("images/so-raw.png", width = 8, height = 6)
-
-# Explore on log scale
-library(MASS)
-qplot(month, count, data = so2, geom = "line", colour = tag) + scale_y_log10() 
-# Growth pretty close to exponential for both
-recent <- filter(so2, month > as.Date("2010-01-01"))
-ggplot(recent, aes(month, count, group = tag)) + 
-  geom_smooth(method = rlm, se = F, colour = "grey50") +
+ggplot(so, aes(month, count)) + 
+  geom_smooth(aes(group = tag), method = MASS::rlm, se = F, colour = "grey70", size = 0.5) +
   geom_line(aes(colour = tag)) +
   scale_y_log10()
+ggsave("images/so-raw.png", width = 8, height = 6 , dpi = 96)
 
 # If we standardise python to 1, we see that R is growing relative to python
 # over time.
-so2 <- so %.% group_by(month) %.% 
-  mutate(rel = count / max(count))
-qplot(month, rel, data = so2, geom = "line", colour = tag) + 
+rel <- so %>% 
+  group_by(month) %>% 
+  mutate(rel = count / max(count)) %>%
+  filter(tag == "r")
+
+ggplot(rel, aes(month, rel)) + 
+  geom_line() + 
   ylab("R questions as proportion of python questions")
-ggsave("images/so-rel.png", width = 8, height = 6)
+ggsave("images/so-rel.png", width = 8, height = 6, dpi = 96)