- Pandas
- Seaborn
- Matplotlib
- Linear Regression
- Numpy
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import missingno as msn
# Apply the ggplot theme to every figure drawn below.
plt.style.use('ggplot')

# Read the raw CSV into a DataFrame.
# NOTE(review): the path is an absolute, machine-specific Windows location.
dataset = pd.read_csv(
    filepath_or_buffer=r'C:\Users\Admin\Desktop\csv files\train_and_test2.csv'
)

# Preview the first two rows, then print the column dtypes and non-null counts.
dataset.head(n=2)
dataset.info()
- The dataset has 1309 rows (entries), and the data type of each column is listed in the output above
Copy the DataFrame so that the original remains untouched
# Work on an independent (deep) copy so `dataset` keeps the data as loaded.
df = dataset.copy(deep=True)

# List the column labels to decide which ones to keep.
df.columns
Drop unnecessary columns
# Remove the 'zero', 'zero.1', ..., 'zero.18' columns from the working copy.
df = df.drop(
    columns=[
        'zero', 'zero.1', 'zero.2', 'zero.3', 'zero.4',
        'zero.5', 'zero.6', 'zero.7', 'zero.8', 'zero.9',
        'zero.10', 'zero.11', 'zero.12', 'zero.13', 'zero.14',
        'zero.15', 'zero.16', 'zero.17', 'zero.18',
    ]
)
df.head()

# Give the mis-spelled target column a readable name.
df = df.rename(columns={'2urvived': 'Survived'})

# Count the missing values in each remaining column.
df.isnull().sum()
- Drop rows with null values, since they make up less than 5% of the data
# Keep only the rows where 'Embarked' is present, then renumber the index
# from 0 so it has no gaps.
df = df.dropna(subset=['Embarked']).reset_index(drop=True)

# Summary statistics for the columns.
df.describe()
- The output above shows the count, mean, std, min, max, ... ; by default these are computed only for numerical columns
- To perform a descriptive analysis on non numerical columns
# Summarise only the object-dtype columns.
df.describe(include=['object'])
- Check how your data is related
# Pairwise correlation between the numeric columns.
# Non-numeric columns are excluded explicitly: since pandas 2.0, DataFrame.corr()
# no longer drops them silently and raises on values it cannot convert to float.
s = df.select_dtypes(include='number').corr()
- I used box plots in order to also check for outliers
# Draw one box plot per numeric column to inspect its spread and outliers.
# plt.show() is inside the loop so every column is rendered on its own figure
# instead of all boxes being drawn onto the same axes.
for col in df.select_dtypes(include='number').columns:
    sns.boxplot(data=df, x=col)
    plt.show()
- I used a histogram
- A histogram is a graph that shows the frequency of numerical data using rectangles.
# Draw one histogram per numeric column to inspect its value frequencies.
# plt.show() is inside the loop so every column is rendered on its own figure
# instead of all histograms being drawn onto the same axes.
for col in df.select_dtypes(include='number').columns:
    sns.histplot(data=df, x=col)
    plt.show()
- From the histogram above, 28 was the most common age among the passengers
# Number of passengers for each value of 'Sex'.
df.loc[:, 'Sex'].value_counts()
- There were more male than female passengers on board
# Bar chart of the passenger count for each value of 'Sex', drawn in the left
# slot of a 1x2 grid on an 18x6 figure.
# NOTE(review): the figure is shown before anything is drawn in the right slot,
# so that half stays empty -- confirm this is intended.
plt.figure(figsize=(18, 6))
plt.subplot(1, 2, 1)
sex_counts = df['Sex'].value_counts()
sex_counts.plot.bar()
plt.ylabel('Number of passengers')
plt.title("Male vs Female Passengers")
plt.show()

# Number of passengers for each value of 'Survived'.
df['Survived'].value_counts()
- More males survived because they were many but overall more females survived
# Bar chart of the passenger count for each value of 'Survived', drawn in the
# right slot of a 1x2 grid.
# NOTE(review): no plt.figure(...) is created here and the previous figure has
# already been shown, so this presumably lands on a fresh default-size figure
# with an empty left slot -- confirm this is intended.
plt.subplot(1, 2, 2)
survival_counts = df['Survived'].value_counts()
survival_counts.plot.bar()
plt.ylabel('NO. of passengers')
plt.title('Passengers who died vs survived')
plt.show()
# Count of passengers per 'Sex', with one bar per 'Pclass' value, drawn in the
# left slot of a 1x2 grid on an 18x6 figure.
plt.figure(figsize=(18, 6))
plt.subplot(1, 2, 1)
sns.countplot(data=df, x='Sex', hue='Pclass')
plt.xlabel('Gender')
plt.ylabel('Number of passengers')
plt.title('Passenger class to Gender')
plt.show()
- There were more passengers in third class than in first class and second class
# Count of passengers per 'Survived' value, with one bar per 'Pclass' value,
# drawn in the right slot of a 1x2 grid.
# NOTE(review): this runs after the previous plt.show() and without a new
# plt.figure(...), so the left slot of this figure is presumably empty -- confirm.
plt.subplot(1, 2, 2)
sns.countplot(data=df, x='Survived', hue='Pclass')
plt.xlabel('Survived')
plt.ylabel('Number of passengers')
plt.title('Survived Passenger to class of Passenger')
plt.show()
- Many passengers from third class did not survive
- Most first class passengers survived
# Point plot of 'Age' against 'Pclass': one point estimate per class with
# capped error bars, joined by a dashed line.
plt.figure(figsize=(15, 7))
sns.pointplot(x='Pclass', y='Age', data=df, linestyles='--', capsize=.3)
# The closing parenthesis was missing here, which made the cell a syntax error.
plt.show()
- I used a point plot to show the average age (with its error bars) for each passenger class











