Skip to content

Commit 27e2af0

Browse files
committed
add notebook
1 parent dbfcb1a commit 27e2af0

4 files changed

Lines changed: 1086 additions & 0 deletions

File tree

Lines changed: 389 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,389 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"# 挑战 25:各国历年二氧化碳 CO2 排放量统计分析"
8+
]
9+
},
10+
{
11+
"cell_type": "markdown",
12+
"metadata": {},
13+
"source": [
14+
"## 1. 数据清洁"
15+
]
16+
},
17+
{
18+
"cell_type": "markdown",
19+
"metadata": {},
20+
"source": [
21+
"### 读取数据"
22+
]
23+
},
24+
{
25+
"cell_type": "code",
26+
"execution_count": 1,
27+
"metadata": {
28+
"scrolled": true
29+
},
30+
"outputs": [],
31+
"source": [
32+
"import pandas as pd\n",
33+
"\n",
34+
"# 读取数据文件\n",
35+
"df_data = pd.read_excel(\"ClimateChange.xlsx\", sheet_name='Data')\n",
36+
"df_country = pd.read_excel(\"ClimateChange.xlsx\", sheet_name='Country')"
37+
]
38+
},
39+
{
40+
"cell_type": "markdown",
41+
"metadata": {},
42+
"source": [
43+
"### 处理 data 数据表"
44+
]
45+
},
46+
{
47+
"cell_type": "code",
48+
"execution_count": 2,
49+
"metadata": {},
50+
"outputs": [],
51+
"source": [
52+
"# 选取 EN.ATM.CO2E.KT 数据,并将国家代码设置为索引\n",
53+
"\n",
54+
"df_data_reindex = df_data[df_data['Series code']== 'EN.ATM.CO2E.KT'].set_index('Country code')"
55+
]
56+
},
57+
{
58+
"cell_type": "code",
59+
"execution_count": 3,
60+
"metadata": {},
61+
"outputs": [],
62+
"source": [
63+
"# 剔除不必要的数据列\n",
64+
"df_data_drop = df_data_reindex.drop(labels=['Country name', 'Series code', 'Series name', 'SCALE', 'Decimals'], axis=1)"
65+
]
66+
},
67+
{
68+
"cell_type": "code",
69+
"execution_count": 4,
70+
"metadata": {},
71+
"outputs": [],
72+
"source": [
73+
"df_data_nan = df_data_drop.replace({'..': pd.np.NaN})"
74+
]
75+
},
76+
{
77+
"cell_type": "code",
78+
"execution_count": 5,
79+
"metadata": {
80+
"scrolled": true
81+
},
82+
"outputs": [],
83+
"source": [
84+
"# 对 NaN 空值进行向前和向后填充\n",
85+
"df_data_fill = df_data_nan.fillna(method='ffill', axis=1).fillna(method='bfill', axis=1)"
86+
]
87+
},
88+
{
89+
"cell_type": "code",
90+
"execution_count": 6,
91+
"metadata": {
92+
"scrolled": true
93+
},
94+
"outputs": [],
95+
"source": [
96+
"# 对填充后依旧全部为空值的数据行进行剔除\n",
97+
"df_data_dropna = df_data_fill.dropna(how='all')"
98+
]
99+
},
100+
{
101+
"cell_type": "markdown",
102+
"metadata": {
103+
"scrolled": true
104+
},
105+
"source": [
106+
"### 处理 Country 数据表"
107+
]
108+
},
109+
{
110+
"cell_type": "code",
111+
"execution_count": 7,
112+
"metadata": {},
113+
"outputs": [],
114+
"source": [
115+
"# 将国家代码设置为索引\n",
116+
"df_country_reindex = pd.DataFrame(df_country).set_index('Country code')"
117+
]
118+
},
119+
{
120+
"cell_type": "code",
121+
"execution_count": 8,
122+
"metadata": {},
123+
"outputs": [],
124+
"source": [
125+
"# 剔除不必要的数据列\n",
126+
"df_country_drop = df_country_reindex.drop(labels=['Capital city', 'Region', 'Lending category'], axis=1)"
127+
]
128+
},
129+
{
130+
"cell_type": "markdown",
131+
"metadata": {
132+
"scrolled": true
133+
},
134+
"source": [
135+
"### 合并数据表"
136+
]
137+
},
138+
{
139+
"cell_type": "code",
140+
"execution_count": 9,
141+
"metadata": {},
142+
"outputs": [],
143+
"source": [
144+
"# 对 Data 和 Country 表按照索引进行合并\n",
145+
"df_combine = pd.concat([df_data_dropna, df_country_drop], axis=1, sort=True)"
146+
]
147+
},
148+
{
149+
"cell_type": "code",
150+
"execution_count": 10,
151+
"metadata": {},
152+
"outputs": [],
153+
"source": [
154+
"# 对合并后数据集进行求和得到各国排放总量\n",
155+
"df_combine['Sum emissions'] = df_combine[list(df_combine)[:-2]].sum(axis=1)"
156+
]
157+
},
158+
{
159+
"cell_type": "code",
160+
"execution_count": 11,
161+
"metadata": {
162+
"scrolled": true
163+
},
164+
"outputs": [],
165+
"source": [
166+
"# 对合并后存在空值的数据行进行剔除,得到清洁后的数据集\n",
167+
"df_clean = df_combine.dropna(thresh=10)"
168+
]
169+
},
170+
{
171+
"cell_type": "markdown",
172+
"metadata": {},
173+
"source": [
174+
"## 2. 数据求和整理"
175+
]
176+
},
177+
{
178+
"cell_type": "markdown",
179+
"metadata": {},
180+
"source": [
181+
"### 按收入群体对数据进行求和"
182+
]
183+
},
184+
{
185+
"cell_type": "code",
186+
"execution_count": 12,
187+
"metadata": {},
188+
"outputs": [],
189+
"source": [
190+
"# 按收入群体对数据进行求和\n",
191+
"sum_by_groups = df_clean.groupby('Income group')['Sum emissions'].sum()"
192+
]
193+
},
194+
{
195+
"cell_type": "markdown",
196+
"metadata": {},
197+
"source": [
198+
"### 按要求整理 DataFrame"
199+
]
200+
},
201+
{
202+
"cell_type": "code",
203+
"execution_count": 13,
204+
"metadata": {},
205+
"outputs": [],
206+
"source": [
207+
"# 按要求整理 DataFrame\n",
208+
"item_high_list = []\n",
209+
"item_low_list = []\n",
210+
"\n",
211+
"for group_name in list(sum_by_groups.index):\n",
212+
"\n",
213+
" # 得到各收入群体最高排放量数据\n",
214+
" item_high = df_clean[df_clean['Income group'] == group_name].sort_values(\n",
215+
" by='Sum emissions', ascending=False).iloc[0]\n",
216+
"\n",
217+
" # 将最高排放量数据存入相应列表方便生成最终 DataFrame\n",
218+
" item_high_list.append(\n",
219+
" (item_high['Income group'], item_high['Country name'], item_high['Sum emissions']))\n",
220+
"\n",
221+
" # 得到各收入群体最低排放量数据\n",
222+
" item_low = df_clean[df_clean['Income group'] ==\n",
223+
" group_name].sort_values(by='Sum emissions').iloc[0]\n",
224+
"\n",
225+
" # 将最低排放量数据存入相应列表方便生成最终 DataFrame\n",
226+
" item_low_list.append(\n",
227+
" (item_low['Income group'], item_low['Country name'], item_low['Sum emissions']))"
228+
]
229+
},
230+
{
231+
"cell_type": "markdown",
232+
"metadata": {},
233+
"source": [
234+
"### 合并输出"
235+
]
236+
},
237+
{
238+
"cell_type": "code",
239+
"execution_count": 14,
240+
"metadata": {},
241+
"outputs": [
242+
{
243+
"data": {
244+
"text/html": [
245+
"<div>\n",
246+
"<style scoped>\n",
247+
" .dataframe tbody tr th:only-of-type {\n",
248+
" vertical-align: middle;\n",
249+
" }\n",
250+
"\n",
251+
" .dataframe tbody tr th {\n",
252+
" vertical-align: top;\n",
253+
" }\n",
254+
"\n",
255+
" .dataframe thead th {\n",
256+
" text-align: right;\n",
257+
" }\n",
258+
"</style>\n",
259+
"<table border=\"1\" class=\"dataframe\">\n",
260+
" <thead>\n",
261+
" <tr style=\"text-align: right;\">\n",
262+
" <th></th>\n",
263+
" <th>Sum emissions</th>\n",
264+
" <th>Highest emission country</th>\n",
265+
" <th>Highest emissions</th>\n",
266+
" <th>Lowest emission country</th>\n",
267+
" <th>Lowest emissions</th>\n",
268+
" </tr>\n",
269+
" <tr>\n",
270+
" <th>Income group</th>\n",
271+
" <th></th>\n",
272+
" <th></th>\n",
273+
" <th></th>\n",
274+
" <th></th>\n",
275+
" <th></th>\n",
276+
" </tr>\n",
277+
" </thead>\n",
278+
" <tbody>\n",
279+
" <tr>\n",
280+
" <th>High income: OECD</th>\n",
281+
" <td>2.588373e+08</td>\n",
282+
" <td>United States</td>\n",
283+
" <td>1.179918e+08</td>\n",
284+
" <td>Iceland</td>\n",
285+
" <td>46944.934</td>\n",
286+
" </tr>\n",
287+
" <tr>\n",
288+
" <th>High income: nonOECD</th>\n",
289+
" <td>1.581112e+07</td>\n",
290+
" <td>Saudi Arabia</td>\n",
291+
" <td>7.009760e+06</td>\n",
292+
" <td>Turks and Caicos Islands</td>\n",
293+
" <td>1503.470</td>\n",
294+
" </tr>\n",
295+
" <tr>\n",
296+
" <th>Low income</th>\n",
297+
" <td>5.485641e+06</td>\n",
298+
" <td>Korea, Dem. Rep.</td>\n",
299+
" <td>3.104479e+06</td>\n",
300+
" <td>Comoros</td>\n",
301+
" <td>2068.188</td>\n",
302+
" </tr>\n",
303+
" <tr>\n",
304+
" <th>Lower middle income</th>\n",
305+
" <td>6.272726e+07</td>\n",
306+
" <td>India</td>\n",
307+
" <td>2.681828e+07</td>\n",
308+
" <td>Kiribati</td>\n",
309+
" <td>601.388</td>\n",
310+
" </tr>\n",
311+
" <tr>\n",
312+
" <th>Upper middle income</th>\n",
313+
" <td>2.100775e+08</td>\n",
314+
" <td>China</td>\n",
315+
" <td>9.809777e+07</td>\n",
316+
" <td>Niue</td>\n",
317+
" <td>80.674</td>\n",
318+
" </tr>\n",
319+
" </tbody>\n",
320+
"</table>\n",
321+
"</div>"
322+
],
323+
"text/plain": [
324+
" Sum emissions Highest emission country \\\n",
325+
"Income group \n",
326+
"High income: OECD 2.588373e+08 United States \n",
327+
"High income: nonOECD 1.581112e+07 Saudi Arabia \n",
328+
"Low income 5.485641e+06 Korea, Dem. Rep. \n",
329+
"Lower middle income 6.272726e+07 India \n",
330+
"Upper middle income 2.100775e+08 China \n",
331+
"\n",
332+
" Highest emissions Lowest emission country \\\n",
333+
"Income group \n",
334+
"High income: OECD 1.179918e+08 Iceland \n",
335+
"High income: nonOECD 7.009760e+06 Turks and Caicos Islands \n",
336+
"Low income 3.104479e+06 Comoros \n",
337+
"Lower middle income 2.681828e+07 Kiribati \n",
338+
"Upper middle income 9.809777e+07 Niue \n",
339+
"\n",
340+
" Lowest emissions \n",
341+
"Income group \n",
342+
"High income: OECD 46944.934 \n",
343+
"High income: nonOECD 1503.470 \n",
344+
"Low income 2068.188 \n",
345+
"Lower middle income 601.388 \n",
346+
"Upper middle income 80.674 "
347+
]
348+
},
349+
"execution_count": 14,
350+
"metadata": {},
351+
"output_type": "execute_result"
352+
}
353+
],
354+
"source": [
355+
"# 设置 DataFrame 标签\n",
356+
"high_labels = ['Income group', 'Highest emission country', 'Highest emissions']\n",
357+
"low_labels = ['Income group', 'Lowest emission country', 'Lowest emissions']\n",
358+
"\n",
359+
"# 生成并合并目标 DataFrame\n",
360+
"highest_df = pd.DataFrame.from_records(item_high_list, columns=high_labels).set_index('Income group')\n",
361+
"lowest_df = pd.DataFrame.from_records(item_low_list, columns=low_labels).set_index('Income group')\n",
362+
"\n",
363+
"results = pd.concat([sum_by_groups, highest_df, lowest_df], axis=1)\n",
364+
"results"
365+
]
366+
}
367+
],
368+
"metadata": {
369+
"kernelspec": {
370+
"display_name": "Python 3",
371+
"language": "python",
372+
"name": "python3"
373+
},
374+
"language_info": {
375+
"codemirror_mode": {
376+
"name": "ipython",
377+
"version": 3
378+
},
379+
"file_extension": ".py",
380+
"mimetype": "text/x-python",
381+
"name": "python",
382+
"nbconvert_exporter": "python",
383+
"pygments_lexer": "ipython3",
384+
"version": "3.6.5"
385+
}
386+
},
387+
"nbformat": 4,
388+
"nbformat_minor": 2
389+
}

0 commit comments

Comments
 (0)