yesoncode
diff --git a/‎week7-1-co2-emissions-analysis/挑战 25：各国历年二氧化碳 CO2 排放量统计分析.ipynb‎
Lines changed: 389 additions & 0 deletions b/‎week7-1-co2-emissions-analysis/挑战 25：各国历年二氧化碳 CO2 排放量统计分析.ipynb‎
Lines changed: 389 additions & 0 deletions
@@ -0,0 +1,389 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 挑战 25：各国历年二氧化碳 CO2 排放量统计分析"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. 数据清洁"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 读取数据"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "# 读取数据文件\n",
+    "df_data = pd.read_excel(\"ClimateChange.xlsx\", sheet_name='Data')\n",
+    "df_country = pd.read_excel(\"ClimateChange.xlsx\", sheet_name='Country')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 处理 data 数据表"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 选取 EN.ATM.CO2E.KT 数据，并将国家代码设置为索引\n",
+    "\n",
+    "df_data_reindex = df_data[df_data['Series code']== 'EN.ATM.CO2E.KT'].set_index('Country code')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 剔除不必要的数据列\n",
+    "df_data_drop = df_data_reindex.drop(labels=['Country name', 'Series code', 'Series name', 'SCALE', 'Decimals'], axis=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_data_nan = df_data_drop.replace({'..': pd.np.NaN})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# 对 NaN 空值进行向前和向后填充\n",
+    "df_data_fill = df_data_nan.fillna(method='ffill', axis=1).fillna(method='bfill', axis=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# 对填充后依旧全部为空值的数据行进行剔除\n",
+    "df_data_dropna = df_data_fill.dropna(how='all')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "scrolled": true
+   },
+   "source": [
+    "### 处理 Country 数据表"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 将国家代码设置为索引\n",
+    "df_country_reindex = pd.DataFrame(df_country).set_index('Country code')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 剔除不必要的数据列\n",
+    "df_country_drop = df_country_reindex.drop(labels=['Capital city', 'Region', 'Lending category'], axis=1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "scrolled": true
+   },
+   "source": [
+    "### 合并数据表"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 对 Data 和 Country 表按照索引进行合并\n",
+    "df_combine = pd.concat([df_data_dropna, df_country_drop], axis=1, sort=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 对合并后数据集进行求和得到各国排放总量\n",
+    "df_combine['Sum emissions'] = df_combine[list(df_combine)[:-2]].sum(axis=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# 对合并后存在空值的数据行进行剔除，得到清洁后的数据集\n",
+    "df_clean = df_combine.dropna(thresh=10)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2. 数据求和整理"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 按收入群体对数据进行求和"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 按收入群体对数据进行求和\n",
+    "sum_by_groups = df_clean.groupby('Income group')['Sum emissions'].sum()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 按要求整理 DataFrame"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 按要求整理 DataFrame\n",
+    "item_high_list = []\n",
+    "item_low_list = []\n",
+    "\n",
+    "for group_name in list(sum_by_groups.index):\n",
+    "\n",
+    "    # 得到各收入群体最高排放量数据\n",
+    "    item_high = df_clean[df_clean['Income group'] == group_name].sort_values(\n",
+    "        by='Sum emissions', ascending=False).iloc[0]\n",
+    "\n",
+    "    # 将最高排放量数据存入相应列表方便生成最终 DataFrame\n",
+    "    item_high_list.append(\n",
+    "        (item_high['Income group'], item_high['Country name'], item_high['Sum emissions']))\n",
+    "\n",
+    "    # 得到各收入群体最低排放量数据\n",
+    "    item_low = df_clean[df_clean['Income group'] ==\n",
+    "                        group_name].sort_values(by='Sum emissions').iloc[0]\n",
+    "\n",
+    "    # 将最低排放量数据存入相应列表方便生成最终 DataFrame\n",
+    "    item_low_list.append(\n",
+    "        (item_low['Income group'], item_low['Country name'], item_low['Sum emissions']))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 合并输出"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Sum emissions</th>\n",
+       "      <th>Highest emission country</th>\n",
+       "      <th>Highest emissions</th>\n",
+       "      <th>Lowest emission country</th>\n",
+       "      <th>Lowest emissions</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Income group</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>High income: OECD</th>\n",
+       "      <td>2.588373e+08</td>\n",
+       "      <td>United States</td>\n",
+       "      <td>1.179918e+08</td>\n",
+       "      <td>Iceland</td>\n",
+       "      <td>46944.934</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>High income: nonOECD</th>\n",
+       "      <td>1.581112e+07</td>\n",
+       "      <td>Saudi Arabia</td>\n",
+       "      <td>7.009760e+06</td>\n",
+       "      <td>Turks and Caicos Islands</td>\n",
+       "      <td>1503.470</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Low income</th>\n",
+       "      <td>5.485641e+06</td>\n",
+       "      <td>Korea, Dem. Rep.</td>\n",
+       "      <td>3.104479e+06</td>\n",
+       "      <td>Comoros</td>\n",
+       "      <td>2068.188</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Lower middle income</th>\n",
+       "      <td>6.272726e+07</td>\n",
+       "      <td>India</td>\n",
+       "      <td>2.681828e+07</td>\n",
+       "      <td>Kiribati</td>\n",
+       "      <td>601.388</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Upper middle income</th>\n",
+       "      <td>2.100775e+08</td>\n",
+       "      <td>China</td>\n",
+       "      <td>9.809777e+07</td>\n",
+       "      <td>Niue</td>\n",
+       "      <td>80.674</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                      Sum emissions Highest emission country  \\\n",
+       "Income group                                                   \n",
+       "High income: OECD      2.588373e+08            United States   \n",
+       "High income: nonOECD   1.581112e+07             Saudi Arabia   \n",
+       "Low income             5.485641e+06         Korea, Dem. Rep.   \n",
+       "Lower middle income    6.272726e+07                    India   \n",
+       "Upper middle income    2.100775e+08                    China   \n",
+       "\n",
+       "                      Highest emissions   Lowest emission country  \\\n",
+       "Income group                                                        \n",
+       "High income: OECD          1.179918e+08                   Iceland   \n",
+       "High income: nonOECD       7.009760e+06  Turks and Caicos Islands   \n",
+       "Low income                 3.104479e+06                   Comoros   \n",
+       "Lower middle income        2.681828e+07                  Kiribati   \n",
+       "Upper middle income        9.809777e+07                      Niue   \n",
+       "\n",
+       "                      Lowest emissions  \n",
+       "Income group                            \n",
+       "High income: OECD            46944.934  \n",
+       "High income: nonOECD          1503.470  \n",
+       "Low income                    2068.188  \n",
+       "Lower middle income            601.388  \n",
+       "Upper middle income             80.674  "
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# 设置 DataFrame 标签\n",
+    "high_labels = ['Income group', 'Highest emission country', 'Highest emissions']\n",
+    "low_labels = ['Income group', 'Lowest emission country', 'Lowest emissions']\n",
+    "\n",
+    "# 生成并合并目标 DataFrame\n",
+    "highest_df = pd.DataFrame.from_records(item_high_list, columns=high_labels).set_index('Income group')\n",
+    "lowest_df = pd.DataFrame.from_records(item_low_list, columns=low_labels).set_index('Income group')\n",
+    "\n",
+    "results = pd.concat([sum_by_groups, highest_df, lowest_df], axis=1)\n",
+    "results"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}