Web scraping, csv, and command-line tools.

jwasham · jwasham · commit f237ab14fd62 · 2016-10-03T21:33:57.000-07:00
diff --git a/data-science/books.py b/data-science/books.py
@@ -0,0 +1,51 @@
+from bs4 import BeautifulSoup
+import csv
+import pprint
+import re
+import requests
+import time
+
+
+def get_book_data(element):
+    """Extract the title and author list from one book listing.
+
+    `element` is a BeautifulSoup Tag containing a 'thumbheader' div (its
+    link text is the title) and an 'AuthorName' div (a "By A, B" byline).
+    Returns a dict with keys 'title' (str) and 'authors' (list of str).
+    """
+    title = element.find('div', 'thumbheader').a.text
+    byline = element.find('div', 'AuthorName').text
+    # Strip only the leading "By"; an unanchored pattern would also delete
+    # "by " inside names (e.g. "Toby Segaran" became "ToSegaran").
+    names = re.sub(r'^\s*by\s+', '', byline, flags=re.IGNORECASE)
+    return {
+        'title': title,
+        'authors': [name.strip() for name in names.split(',')],
+    }
+
+
+def main():
+    """Scrape NUM_PAGES O'Reilly listing pages into books.txt; print them."""
+    NUM_PAGES = 31
+    base_url = 'http://shop.oreilly.com/category/browse-subjects/data.do?sortby=publicationDate&page='
+    books = []
+
+    for page_num in range(1, NUM_PAGES + 1):
+        print("souping page", page_num, ",", len(books), " found so far")
+        # A timeout keeps one stalled connection from hanging the whole run.
+        html = requests.get(base_url + str(page_num), timeout=60).text
+        soup = BeautifulSoup(html, 'html5lib')
+        books.extend(get_book_data(group) for group in soup('td', 'thumbtext'))
+        time.sleep(30)  # pause between page requests
+
+    # newline='' per the csv docs; UTF-8 so non-ASCII names write on any locale.
+    with open('books.txt', 'w', newline='', encoding='utf-8') as out_file:
+        writer = csv.writer(out_file, delimiter=',')
+        writer.writerow(["Title", "Authors"])
+        writer.writerows([b['title'], ', '.join(b['authors'])] for b in books)
+
+    pprint.pprint(books)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/data-science/egrep.py b/data-science/egrep.py
@@ -0,0 +1,8 @@
+import re
+import sys
+
+# Print each stdin line that matches the regex given as the first argument.
+if len(sys.argv) < 2:
+    sys.exit("usage: egrep.py regex")  # written to stderr, exit status 1
+pattern = re.compile(sys.argv[1])  # compile once, not per input line
+sys.stdout.writelines(line for line in sys.stdin if pattern.search(line))
diff --git a/data-science/line-count.py b/data-science/line-count.py
@@ -0,0 +1,7 @@
+import sys
+
+# Report how many lines arrive on standard input.
+# The generator consumes stdin one line at a time while summing.
+count = sum(1 for _ in sys.stdin)
+
+print(count)
diff --git a/data-science/most_common_words.py b/data-science/most_common_words.py
@@ -0,0 +1,19 @@
+from collections import Counter
+import sys
+
+# Print the num_words most frequent whitespace-separated words on stdin,
+# one per line as "count<TAB>word"; words are lowercased before counting.
+try:
+    num_words = int(sys.argv[1])
+except (IndexError, ValueError):
+    # IndexError: no argument given; ValueError: argument is not an integer.
+    # sys.exit(str) writes the message to stderr and exits with status 1.
+    sys.exit("usage: most_common_words.py num_words")
+
+# str.split() with no separator already drops surrounding whitespace and
+# never yields empty strings, so no strip() or emptiness filter is needed.
+words = (word.lower() for line in sys.stdin for word in line.split())
+counter = Counter(words)
+
+for word, count in counter.most_common(num_words):
+    sys.stdout.write("{}\t{}\n".format(count, word))