diff --git a/.gitignore b/.gitignore index 508df78..bbe5c9e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ .Rproj.user .Rhistory +README.html diff --git a/README.md b/README.md index 28a53de..b31ce2b 100644 --- a/README.md +++ b/README.md @@ -12,17 +12,17 @@ Using the [stackexchange data explorer](http://data.stackexchange.com/stackoverf A little further exploration (not shown) indicates that this is very close to being exponential growth. -If we standardise the number of R questions by the number of python questions, we see that the number of R questions is increasing more rapidly than python. Currently, about 1 question about R is asked for every four questions asked about python. +If we standardise the number of R questions by the number of python questions, we see that the number of R questions is increasing more rapidly than python. Currently, about 1 question about R is asked for every three questions asked about python. ![R questions growing relative to python](images/so-rel.png) ## Github repos -Again we see exponential growth in both repos containing R code and repos containing python code (these number don't include forks), but R repo's are relatively less common than R questions. +Again we see exponential growth in both repos containing R code and repos containing python code (these numbers don't include forks), but R repos are relatively less common than R questions. The big jump in repo creation in 2014 is probably due to the [JHU coursera course](https://www.coursera.org/course/datascitoolbox). ![Explosive growth of R and python repos over time](images/github-raw.png) -If we standardise the number of R repos by the number of python repos, we see that R is (slowly) catching up, but it's a long way behind. +If we standardise the number of R repos by the number of python repos, we see that R has been decreasing since the big jump in 2014. 
![R repos growing relative to python repos](images/github-rel.png) @@ -41,3 +41,7 @@ This is the data of monthly downloads made available from the [Python PyPi Packa * Look at use of mailing lists. Is there a pydata specific mailing list? * Compare twitter hashtags: rstats, python, pydata? * Compare package downloads? +* Number of Kaggle solution scripts written in R versus Python. +* Number of Machine Learning courses on MOOC sites that use R versus Python. +* Compare attendees at big R versus big Python data conferences year-over-year. + diff --git a/github-1-get.R b/github-1-get.R index d795607..0604c36 100644 --- a/github-1-get.R +++ b/github-1-get.R @@ -4,16 +4,15 @@ library(dplyr) # Github API ------------------------------------------------------------------- -user <- Sys.getenv("GITHUB_USER") -pwd <- Sys.getenv("GITHUB_PASS") -if (user == "" || pwd == "") { - stop("Set GITHUB_USER and GITHUB_PASS env vars", call. = FALSE) +pat <- Sys.getenv("GITHUB_PAT") +if (pat == "") { + stop("Set GITHUB_PAT env vars", call. 
= FALSE) } # install_github("rgithub", "cscheid") base <- "https://api.github.com" config <- c( - authenticate(user, pwd, type = "basic"), + authenticate(pat, "", type = "basic"), add_headers( Accept = "application/vnd.github.preview", "User-Agent" = "hadley/r-python" @@ -24,9 +23,9 @@ rate_limit <- function() { } repos <- function(query) { - Sys.sleep(60 / 20) # 20 requests per minute + Sys.sleep(60 / 30) # 30 requests per minute path <- paste0(base, "/search/repositories") - qs <- list(q = paste("language:r", query), per_page = 1) + qs <- list(q = query, per_page = 1) req <- GET(path, config, query = qs) stop_for_status(req) @@ -38,23 +37,34 @@ cur_year <- year(today()) past <- expand.grid(year = 2011:(cur_year - 1), month = 1:12) pres <- data.frame(year = cur_year, month = 1:(month(today()) - 2)) -months <- rbind(past, pres) - -all <- rbind(cbind(months, lang = "r"), cbind(months, lang = "python")) -all <- all %.% group_by(lang) %.% arrange(year, month) -all <- all %.% mutate( - year_next = lead(year, default = cur_year), - month_next = lead(month, default = month(today()) - 1) -) +months <- bind_rows(past, pres) %>% + arrange(year, month) %>% + mutate( + start = as.Date(ISOdate(year, month, 1)), + end = start + months(1) - days(1) + ) -all$query <- paste0("language:", all$lang, - " created:", all$year, "-", all$month, "..", - all$year_next, "-", all$month_next) -all$count <- NA +all <- bind_rows( + months %>% mutate(lang = "r"), + months %>% mutate(lang = "python") + ) %>% + mutate( + query = paste0("language:", lang, " created:", start, "..", end), + count = NA_integer_ + ) +# +# if (file.exists("github.rds")) { +# cached <- readRDS("github.rds") %>% ungroup() %>% mutate(lang = as.character(lang)) +# +# all <- bind_rows( +# cached, +# all %>% anti_join(cached, c("year", "month", "lang")) +# ) +# } missing <- seq_along(all$count)[is.na(all$count)] -for(i in missing) { - all$count[[i]] <- repos(all$query[i]) +for (i in missing) { cat(".") + all$count[[i]] <- 
repos(all$query[i]) } saveRDS(all, "github.rds") diff --git a/github-2-explore.R b/github-2-explore.R index 11f5dfe..ae965fc 100644 --- a/github-2-explore.R +++ b/github-2-explore.R @@ -1,22 +1,23 @@ library(ggplot2) library(dplyr) -all <- ungroup(readRDS("github.rds")) -all$query <- NULL -all$date <- as.Date(ISOdate(all$year, all$month, 1)) -all$lang <- as.character(all$lang) +gh <- readRDS("github.rds") +gh$query <- NULL -# For github repos, both R and python growing exponential, but python -# is a long way ahead -qplot(date, count, data = all, geom = "line", colour = lang) -ggsave("images/github-raw.png", width = 8, height = 6) +# For github repos, both R and python growing exponentially, but python +# is a long way ahead. Big spike in R repo creation in early 2014 +# probably due to JHU coursera course. +ggplot(gh, aes(start, count)) + + geom_line(aes(colour = lang)) + + scale_y_log10() +ggsave("images/github-raw.png", width = 8, height = 6, dpi = 96) -qplot(date, count, data = all, geom = "line", colour = lang) + scale_y_log10() +rel <- gh %>% + group_by(start) %>% + mutate(rel = count / max(count)) %>% + filter(lang == "r") -all_rel <- all %.% group_by(date) %.% - mutate(rel = count / max(count)) - -# Again R is catching up, but it's a lot further behind -qplot(date, rel, data = all_rel %.% filter(lang == "r"), geom = "line") + - ylab("R repos relative to python repos") +# Steady decline in relative usage since 2014 +ggplot(rel, aes(start, rel)) + + geom_line() ggsave("images/github-rel.png", width = 8, height = 6) diff --git a/github.rds b/github.rds index ba6c1de..739f290 100644 Binary files a/github.rds and b/github.rds differ diff --git a/images/github-raw.png b/images/github-raw.png index c712c6b..b5488a4 100644 Binary files a/images/github-raw.png and b/images/github-raw.png differ diff --git a/images/github-rel.png b/images/github-rel.png index 51cea2c..3b51eba 100644 Binary files a/images/github-rel.png and b/images/github-rel.png differ diff 
--git a/images/so-raw.png b/images/so-raw.png index 731d122..26f0b68 100644 Binary files a/images/so-raw.png and b/images/so-raw.png differ diff --git a/images/so-rel.png b/images/so-rel.png index b0e183b..25f6c7f 100644 Binary files a/images/so-rel.png and b/images/so-rel.png differ diff --git a/pydata_packages.yaml b/pydata_packages.yaml new file mode 100644 index 0000000..ebb44df --- /dev/null +++ b/pydata_packages.yaml @@ -0,0 +1,39 @@ +stackoverflow_tags: + - scikit-learn + - scikits-learn + - scikits + - scikit-image + - scipy + - numpy + - matplotlib + - matplotlib-basemap + - pandas + - networkx + - nltk + - theano + - h5py + - statsmodels + - ipython + - ipython-notebook + - ipython-magic + - ipython-parallel + - python-imaging-library + - pillow +import_names: + - sklearn + - numpy + - PIL + - scipy + - matplotlib + - ipython + - pandas + - networkx + - numexpr + - tables + - patsy + - statsmodels + - sympy + - skimage + - h5py + - nltk + - theano diff --git a/stackoverflow.R b/stackoverflow.R index 96cb0f3..9fc8912 100644 --- a/stackoverflow.R +++ b/stackoverflow.R @@ -1,29 +1,28 @@ library(ggplot2) library(dplyr) +library(readr) # http://data.stackexchange.com/stackoverflow/query/150296/r-and-python-questions -so <- read.csv("http://data.stackexchange.com/stackoverflow/csv/186078") +so <- read_csv("http://data.stackexchange.com/stackoverflow/csv/186078") names(so) <- c("month", "tag", "count") so$month <- as.Date(so$month) +so <- so %>% filter(month > as.Date("2010-01-01")) # too noisy before then # Explosive growth of both python and R tags -qplot(month, count, data = so2, geom = "line", colour = tag) -ggsave("images/so-raw.png", width = 8, height = 6) - -# Explore on log scale -library(MASS) -qplot(month, count, data = so2, geom = "line", colour = tag) + scale_y_log10() -# Growth pretty close to exponential for both -recent <- filter(so2, month > as.Date("2010-01-01")) -ggplot(recent, aes(month, count, group = tag)) + - geom_smooth(method = rlm, se = 
F, colour = "grey50") + +ggplot(so, aes(month, count)) + + geom_smooth(aes(group = tag), method = MASS::rlm, se = F, colour = "grey70", size = 0.5) + geom_line(aes(colour = tag)) + scale_y_log10() +ggsave("images/so-raw.png", width = 8, height = 6 , dpi = 96) # If we standardise python to 1, we see that R is growing relative to python # over time. -so2 <- so %.% group_by(month) %.% - mutate(rel = count / max(count)) -qplot(month, rel, data = so2, geom = "line", colour = tag) + +rel <- so %>% + group_by(month) %>% + mutate(rel = count / max(count)) %>% + filter(tag == "r") + +ggplot(rel, aes(month, rel)) + + geom_line() + ylab("R questions as proportion of python questions") -ggsave("images/so-rel.png", width = 8, height = 6) +ggsave("images/so-rel.png", width = 8, height = 6, dpi = 96)