{ "cells": [ { "cell_type": "markdown", "id": "05c0a10d", "metadata": {}, "source": [ "## Chapter 6 Altair Data Visualization\n", "Exercise 3" ] }, { "cell_type": "code", "execution_count": 1, "id": "02b4d349", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from pathlib import Path\n", "import altair as alt" ] }, { "cell_type": "code", "execution_count": 2, "id": "8b2b4ef8", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameAuthorUser RatingReviewsPriceYearGenre
010-Day Green Smoothie CleanseJJ Smith4.717350.08.02016.0Non Fiction
111/22/63: A NovelStephen King4.62052.022.02011.0Fiction
212 Rules for Life: An Antidote to ChaosJordan B. Peterson4.718979.015.02018.0Non Fiction
31984 (Signet Classics)George Orwell4.721424.06.02017.0Fiction
45,000 Awesome Facts (About Everything!) (Natio...National Geographic Kids4.87665.012.02019.0Non Fiction
\n", "
" ], "text/plain": [ " Name \\\n", "0 10-Day Green Smoothie Cleanse \n", "1 11/22/63: A Novel \n", "2 12 Rules for Life: An Antidote to Chaos \n", "3 1984 (Signet Classics) \n", "4 5,000 Awesome Facts (About Everything!) (Natio... \n", "\n", " Author User Rating Reviews Price Year Genre \n", "0 JJ Smith 4.7 17350.0 8.0 2016.0 Non Fiction \n", "1 Stephen King 4.6 2052.0 22.0 2011.0 Fiction \n", "2 Jordan B. Peterson 4.7 18979.0 15.0 2018.0 Non Fiction \n", "3 George Orwell 4.7 21424.0 6.0 2017.0 Fiction \n", "4 National Geographic Kids 4.8 7665.0 12.0 2019.0 Non Fiction " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# openpyxl must be installed to read Excel files\n", "# use python -m pip install openpyxl\n", "src_file = Path.cwd() / 'data' / 'raw' / 'AmazonBooks.xlsx'\n", "# Some Author values carry stray leading/trailing spaces (e.g. ' Tara Westover').\n", "# Strip them at load time so each author maps to a single groupby/filter key.\n", "df = pd.read_excel(src_file).assign(\n", "    Author=lambda books: books['Author'].str.strip()\n", ")\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 3, "id": "8d159ec3", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "" ], "text/plain": [ "alt.Chart(...)" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Bar chart of summed review counts per publication year, coloured by genre.\n", "alt.Chart(df).mark_bar().encode(\n", "    x=alt.X('sum(Reviews)', title='Number of Reviews'),\n", "    y=alt.Y('Year:O', title='Published Year'),\n", "    color=alt.Color('Genre'),\n", ")" ] }, { "cell_type": "code", "execution_count": 4, "id": "386e8e19", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "" ], "text/plain": [ "alt.Chart(...)" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Heatmap of mean price per year/genre cell; the tooltip also shows the row count.\n", "alt.Chart(df).mark_rect().encode(\n", "    alt.X('Year:O'),\n", "    alt.Y('Genre:O'),\n", "    alt.Color('mean(Price):Q'),\n", "    tooltip=[\n", "        alt.Tooltip('mean(Price):Q', format='$.2f'),\n", "        alt.Tooltip('count(Name):Q', format='.0f'),\n", "    ],\n", ")" ] }, { "cell_type": "code", "execution_count": 5, "id": "61eb75ab", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 600 entries, 0 to 599\n", "Data columns (total 7 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Name 600 non-null object \n", " 1 Author 600 non-null object \n", " 2 User Rating 600 non-null float64\n", " 3 Reviews 600 non-null float64\n", " 4 Price 600 non-null float64\n", " 5 Year 600 non-null float64\n", " 6 Genre 600 non-null object \n", "dtypes: float64(4), object(3)\n", "memory usage: 32.9+ KB\n" ] } ], "source": [ "# Column dtypes and non-null counts.\n", "df.info()" ] }, { "cell_type": "code", "execution_count": 6, "id": "c4ad0e3e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Jeff Kinney 13\n", "Suzanne Collins 12\n", "Gary Chapman 12\n", "Rick Riordan 11\n", "American Psychological Association 10\n", " ..\n", "Crispin Boyer 1\n", "Amy Shields 1\n", "Elie Wiesel 1\n", "Mark Owen 1\n", " Tara Westover 1\n", "Name: Author, Length: 275, dtype: int64" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Number of rows per author, most frequent first.\n", "df['Author'].value_counts()" ] }, { "cell_type": "code", "execution_count": 7, "id": "450f927f", "metadata": {}, "outputs": [], "source": [ "# The 20 authors with the largest summed review counts, largest first.\n", "top_authors = (\n", "    df.groupby('Author')['Reviews']\n", "    .sum()\n", "    .nlargest(20)\n", "    .index\n", "    .tolist()\n", ")" ] }, { "cell_type": "code", "execution_count": 8, "id": "5247cd8c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['Suzanne Collins',\n", " 'Michelle Obama',\n", " 'John 
Green',\n", " 'Delia Owens',\n", " 'Gary Chapman',\n", " 'E L James',\n", " 'Dr. Seuss',\n", " 'Eric Carle',\n", " 'Gillian Flynn',\n", " 'Paula Hawkins',\n", " 'Laura Hillenbrand',\n", " 'Harper Lee',\n", " 'Don Miguel Ruiz',\n", " 'Dale Carnegie',\n", " 'Sarah Young',\n", " 'Craig Smith',\n", " 'Stephenie Meyer',\n", " 'R. J. Palacio',\n", " 'Kristin Hannah',\n", " 'Mary L. Trump Ph.D. ']" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "top_authors" ] }, { "cell_type": "code", "execution_count": 9, "id": "c84e59f6", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "" ], "text/plain": [ "alt.Chart(...)" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Bubble chart restricted to top_authors: one circle per author and year,\n", "# sized by the summed review count and coloured by author.\n", "alt.Chart(\n", "    df[df['Author'].isin(top_authors)]\n", ").mark_circle(\n", "    opacity=0.8, stroke='black', strokeWidth=1\n", ").encode(\n", "    x='Year:O',\n", "    y='Author',\n", "    size=alt.Size(\n", "        'sum(Reviews)',\n", "        scale=alt.Scale(range=[0, 500]),\n", "        legend=alt.Legend(title='Reviews'),\n", "    ),\n", "    color='Author',\n", ")" ] }, { "cell_type": "code", "execution_count": 10, "id": "b983fe83", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "" ], "text/plain": [ "alt.Chart(...)" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Bubble chart restricted to top_authors, sized by the summed review count;\n", "# the colour legend is switched off via legend=None.\n", "alt.Chart(\n", "    df[df['Author'].isin(top_authors)]\n", ").mark_circle(\n", "    opacity=0.8, stroke='black', strokeWidth=1\n", ").encode(\n", "    x='Year:O',\n", "    y='Author',\n", "    size=alt.Size(\n", "        'sum(Reviews)',\n", "        scale=alt.Scale(range=[0, 500]),\n", "        legend=alt.Legend(title='Reviews'),\n", "    ),\n", "    color=alt.Color('Author', legend=None),\n", ")" ] }, { "cell_type": "code", "execution_count": 11, "id": "15e2bd4e", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "" ], "text/plain": [ "alt.Chart(...)" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Titled bubble chart built from the full frame; rows are limited to\n", "# top_authors by a chart-level filter transform instead of a pandas filter.\n", "alt.Chart(df).mark_circle(\n", "    opacity=0.8, stroke='black', strokeWidth=1\n", ").transform_filter(\n", "    alt.FieldOneOfPredicate(field='Author', oneOf=top_authors)\n", ").encode(\n", "    x='Year:O',\n", "    y='Author',\n", "    size=alt.Size(\n", "        'sum(Reviews)',\n", "        scale=alt.Scale(range=[0, 900]),\n", "        legend=alt.Legend(title='Reviews'),\n", "    ),\n", "    color=alt.Color('Author', legend=None),\n", ").properties(\n", "    title='Amazon Author Reviews',\n", "    width=550,\n", "    height=475,\n", ").configure_axis(\n", "    grid=True\n", ")" ] }, { "cell_type": "code", "execution_count": null, "id": "8f248458", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.11" } }, "nbformat": 4, "nbformat_minor": 5 }