Python 3.9.6 (tags/v3.9.6:db3ff76, Jun 28 2021, 15:26:21) [MSC v.1929 64 bit (AMD64)]
Type 'copyright', 'credits' or 'license' for more information
IPython 7.25.0 -- An enhanced Interactive Python. Type '?' for help.
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns; sns.set_theme()
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
# Load the training split from its CSV file, then show summary statistics
# for its numeric columns.
train_csv = r"C:\Users\eccel\OneDrive\Documents\onedrive\titanic\train.csv"
train = pd.read_csv(train_csv)
train.describe()
PassengerId | Survived | Pclass | Age | SibSp | Parch | Fare | |
---|---|---|---|---|---|---|---|
count | 891.000000 | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000 |
mean | 446.000000 | 0.383838 | 2.308642 | 29.699118 | 0.523008 | 0.381594 | 32.204208 |
std | 257.353842 | 0.486592 | 0.836071 | 14.526497 | 1.102743 | 0.806057 | 49.693429 |
min | 1.000000 | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
25% | 223.500000 | 0.000000 | 2.000000 | 20.125000 | 0.000000 | 0.000000 | 7.910400 |
50% | 446.000000 | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
75% | 668.500000 | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 31.000000 |
max | 891.000000 | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
# Load the test split from its CSV file, then show summary statistics for
# its numeric columns.
test_csv = r"C:\Users\eccel\OneDrive\Documents\onedrive\titanic\test.csv"
test = pd.read_csv(test_csv)
test.describe()
PassengerId | Pclass | Age | SibSp | Parch | Fare | |
---|---|---|---|---|---|---|
count | 418.000000 | 418.000000 | 332.000000 | 418.000000 | 418.000000 | 417.000000 |
mean | 1100.500000 | 2.265550 | 30.272590 | 0.447368 | 0.392344 | 35.627188 |
std | 120.810458 | 0.841838 | 14.181209 | 0.896760 | 0.981429 | 55.907576 |
min | 892.000000 | 1.000000 | 0.170000 | 0.000000 | 0.000000 | 0.000000 |
25% | 996.250000 | 1.000000 | 21.000000 | 0.000000 | 0.000000 | 7.895800 |
50% | 1100.500000 | 3.000000 | 27.000000 | 0.000000 | 0.000000 | 14.454200 |
75% | 1204.750000 | 3.000000 | 39.000000 | 1.000000 | 0.000000 | 31.500000 |
max | 1309.000000 | 3.000000 | 76.000000 | 8.000000 | 9.000000 | 512.329200 |
# Display the training frame (rendered as the cell's output).
train
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | NaN | S |
887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | B42 | S |
888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | NaN | 1 | 2 | W./C. 6607 | 23.4500 | NaN | S |
889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | C |
890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | NaN | Q |
891 rows × 12 columns
# Display the test frame (rendered as the cell's output).
test
PassengerId | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 892 | 3 | Kelly, Mr. James | male | 34.5 | 0 | 0 | 330911 | 7.8292 | NaN | Q |
1 | 893 | 3 | Wilkes, Mrs. James (Ellen Needs) | female | 47.0 | 1 | 0 | 363272 | 7.0000 | NaN | S |
2 | 894 | 2 | Myles, Mr. Thomas Francis | male | 62.0 | 0 | 0 | 240276 | 9.6875 | NaN | Q |
3 | 895 | 3 | Wirz, Mr. Albert | male | 27.0 | 0 | 0 | 315154 | 8.6625 | NaN | S |
4 | 896 | 3 | Hirvonen, Mrs. Alexander (Helga E Lindqvist) | female | 22.0 | 1 | 1 | 3101298 | 12.2875 | NaN | S |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
413 | 1305 | 3 | Spector, Mr. Woolf | male | NaN | 0 | 0 | A.5. 3236 | 8.0500 | NaN | S |
414 | 1306 | 1 | Oliva y Ocana, Dona. Fermina | female | 39.0 | 0 | 0 | PC 17758 | 108.9000 | C105 | C |
415 | 1307 | 3 | Saether, Mr. Simon Sivertsen | male | 38.5 | 0 | 0 | SOTON/O.Q. 3101262 | 7.2500 | NaN | S |
416 | 1308 | 3 | Ware, Mr. Frederick | male | NaN | 0 | 0 | 359309 | 8.0500 | NaN | S |
417 | 1309 | 3 | Peter, Master. Michael J | male | NaN | 1 | 1 | 2668 | 22.3583 | NaN | C |
418 rows × 11 columns
# Data checking: count the missing values in every training column.
train.isna().sum()
PassengerId 0 Survived 0 Pclass 0 Name 0 Sex 0 Age 177 SibSp 0 Parch 0 Ticket 0 Fare 0 Cabin 687 Embarked 2 dtype: int64
## Data Cleaning
# Impute missing ages with the median age of the training set.
train['Age'] = train['Age'].fillna(train['Age'].median())
# Keep only the rows that have a port of embarkation. The explicit .copy()
# makes the filtered result an independent frame, so later column
# assignments on `train` neither raise SettingWithCopyWarning nor risk
# writing to a temporary object.
train = train[train['Embarked'].notna()].copy()
def getEmbarkedValue(row):
    """Return the integer code for a single embarkation-port value.

    Despite its name, ``row`` is one cell value (the function is applied
    element-wise to the ``Embarked`` column), not a whole DataFrame row.

    Mapping: ``'S'`` -> 3, ``'C'`` -> 2, anything else -> 1. The fallback
    also covers ``'Q'``, ``None`` and NaN, so missing values are encoded
    the same way as ``'Q'``.
    """
    if row == 'S':
        return 3
    if row == 'C':
        return 2
    # Every remaining value (including 'Q' and missing markers) shares code 1.
    return 1
# Fill any missing port with 'S'. Rows without a port were already filtered
# out of `train` earlier, so this is only a safeguard. The result is assigned
# back because fillna(..., inplace=True) on a selected column mutates a
# temporary Series that may not be linked to the frame.
train['Embarked'] = train['Embarked'].fillna('S')
# Encode the port letters as integers in both data sets. The function is
# passed directly; wrapping it in a lambda adds nothing.
train['Embarked'] = train['Embarked'].apply(getEmbarkedValue)
test['Embarked'] = test['Embarked'].apply(getEmbarkedValue)
d:\Python\lib\site-packages\pandas\core\generic.py:6383: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy return self._update_inplace(result) d:\Python\lib\site-packages\pandas\core\frame.py:3607: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy self._set_item(key, value)
# Re-count the missing values per column now that cleaning has run.
train.isna().sum()
PassengerId 0 Survived 0 Pclass 0 Name 0 Sex 0 Age 0 SibSp 0 Parch 0 Ticket 0 Fare 0 Cabin 687 Embarked 0 dtype: int64
# Mark passengers without a recorded cabin as 'U'. The result is assigned
# back rather than filled in place on a selected column, which may act on a
# temporary Series.
train['Cabin'] = train['Cabin'].fillna('U')
# Reduce every cabin string to its first character.
train['Cabin'] = train['Cabin'].str[0]
# Show the distinct letters that remain.
train['Cabin'].unique()
array(['U', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)
# Integer code assigned to each cabin letter ('U' is the placeholder used
# for passengers without a recorded cabin).
temp_cabinreplace = {
    'T': 0,
    'U': 1,
    'A': 2,
    'G': 3,
    'C': 4,
    'F': 5,
    'B': 6,
    'E': 7,
    'D': 8,
}
# Replace each letter with its code. Series.map does the dictionary lookup
# directly and yields NaN (not None) for a letter that has no code.
train['Cabin'] = train['Cabin'].map(temp_cabinreplace)
# Standardise the codes with StandardScaler. The scaler needs a 2-D column
# vector as input; ravel() turns its (n, 1) output back into a 1-D array
# before it is stored in the column.
train['Cabin'] = StandardScaler().fit_transform(
    train['Cabin'].values.reshape(-1, 1)
).ravel()
train.head()['Cabin']
d:\Python\lib\site-packages\pandas\core\frame.py:3607: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy self._set_item(key, value)
0 -0.491986 1 0.967554 2 -0.491986 3 0.967554 4 -0.491986 Name: Cabin, dtype: float64
# Show the (rows, columns) shape of the training frame at this point.
train.shape
def bar_chart(feature):
    """Plot stacked bars of survivor / non-survivor counts for one column.

    Reads the module-level ``train`` frame, counts the values of
    ``train[feature]`` separately for rows with ``Survived == 1`` and
    ``Survived == 0``, and draws the two count rows as a stacked bar chart
    titled with the column name.

    Parameters
    ----------
    feature : str
        Name of the column in ``train`` whose values are counted.
    """
    is_survivor = train['Survived'] == 1
    is_dead = train['Survived'] == 0
    # Select rows and column in one .loc step instead of chained indexing.
    survived = train.loc[is_survivor, feature].value_counts()
    dead = train.loc[is_dead, feature].value_counts()

    # One row per outcome, one column per distinct feature value.
    counts = pd.DataFrame([survived, dead])
    counts.index = ['survived', 'dead']
    counts.plot(kind='bar', stacked=True, figsize=(10, 5))
    plt.title(feature)
# Columns to visualise against survival.
x = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']
# Draw one stacked bar chart per listed column.
for i in x:
    bar_chart(i)
# Show the column labels of the training frame.
train.columns
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype='object')
# Encode sex as 0 (male) / 1 (female). Writing through .loc updates the
# frame itself; the chained form `train.Sex[mask] = value` assigns into an
# intermediate Series, triggers SettingWithCopyWarning and is not guaranteed
# to reach the frame.
train.loc[train['Sex'] == 'male', 'Sex'] = 0
train.loc[train['Sex'] == 'female', 'Sex'] = 1
# The column still has object dtype after the replacement; make it numeric.
train['Sex'] = pd.to_numeric(train['Sex'])
def titanic_corr(data):
    """Draw an annotated heatmap of the pairwise column correlations.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose numeric and boolean columns are correlated. Columns of
        any other dtype (e.g. strings) are left out.
    """
    # Select the numeric/bool columns explicitly. Older pandas dropped the
    # other columns silently inside corr(); newer releases raise on string
    # columns instead, so doing it here keeps the result the same on both.
    numeric = data.select_dtypes(include=['number', 'bool'])
    correlation = numeric.corr()
    sns.heatmap(correlation, annot=True, cbar=True, cmap="RdYlGn")
# Plot the correlation heatmap for the training frame.
titanic_corr(train)
<ipython-input-12-173f5c2fd076>:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy train.Sex[train.Sex =='male']=0 d:\Python\lib\site-packages\pandas\core\generic.py:8861: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy return self._update_inplace(result) <ipython-input-12-173f5c2fd076>:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy train.Sex[train.Sex=='female']=1
# Target vector: the survival label of every training row.
y = train['Survived'].to_numpy()
# Feature matrix: the training frame minus the target and the columns the
# models do not use.
X = train.drop(
    columns=['Survived', 'Embarked', 'PassengerId', 'Cabin', 'Ticket', 'Name', 'SibSp', 'Age']
)
# NOTE(review): any NaN left in any remaining column is replaced by the
# constant 30 -- confirm this placeholder is intended.
X = X.fillna(30)
print(X)
Pclass Sex Parch Fare 0 3 0 0 7.2500 1 1 1 0 71.2833 2 3 1 0 7.9250 3 1 1 0 53.1000 4 3 0 0 8.0500 .. ... ... ... ... 886 2 0 0 13.0000 887 1 1 0 30.0000 888 3 1 2 23.4500 889 1 0 0 30.0000 890 3 0 0 7.7500 [889 rows x 4 columns]
# Encode sex as 0 (male) / 1 (female) in the test frame. Writing through
# .loc updates the frame itself; the chained form `test.Sex[mask] = value`
# assigns into an intermediate Series, triggers SettingWithCopyWarning and
# is not guaranteed to reach the frame.
test.loc[test['Sex'] == 'male', 'Sex'] = 0
test.loc[test['Sex'] == 'female', 'Sex'] = 1
# The column still has object dtype after the replacement; make it numeric.
test['Sex'] = pd.to_numeric(test['Sex'])
test.head()
<ipython-input-14-60bcbf6be31e>:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy test.Sex[test.Sex =='male']=0 <ipython-input-14-60bcbf6be31e>:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy test.Sex[test.Sex=='female']=1
PassengerId | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 892 | 3 | Kelly, Mr. James | 0 | 34.5 | 0 | 0 | 330911 | 7.8292 | NaN | 1 |
1 | 893 | 3 | Wilkes, Mrs. James (Ellen Needs) | 1 | 47.0 | 1 | 0 | 363272 | 7.0000 | NaN | 3 |
2 | 894 | 2 | Myles, Mr. Thomas Francis | 0 | 62.0 | 0 | 0 | 240276 | 9.6875 | NaN | 1 |
3 | 895 | 3 | Wirz, Mr. Albert | 0 | 27.0 | 0 | 0 | 315154 | 8.6625 | NaN | 3 |
4 | 896 | 3 | Hirvonen, Mrs. Alexander (Helga E Lindqvist) | 1 | 22.0 | 1 | 1 | 3101298 | 12.2875 | NaN | 3 |
# Feature matrix for the test frame: drop the columns the models do not use.
test_x = test.drop(
    columns=['Embarked', 'PassengerId', 'Cabin', 'Ticket', 'Name', 'SibSp', 'Age']
)
# NOTE(review): any NaN left in any remaining column is replaced by the
# constant 30 -- confirm this placeholder is intended.
test_x = test_x.fillna(30)
print(test_x)
# Split the labelled data 80/20, stratified on the target, with a fixed seed
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
Pclass Sex Parch Fare 0 3 0 0 7.8292 1 3 1 0 7.0000 2 2 0 0 9.6875 3 3 0 0 8.6625 4 3 1 1 12.2875 .. ... ... ... ... 413 3 0 0 8.0500 414 1 1 0 108.9000 415 3 0 0 7.2500 416 3 0 0 8.0500 417 3 0 1 22.3583 [418 rows x 4 columns]
#Using KNN neighbors
# Fit one k-NN classifier for each k in 1..8 and record its accuracy on the
# training split and on the held-out split.
neighbors = np.arange(1, 9)
train_accuracy = np.empty(len(neighbors))
test_accuracy = np.empty(len(neighbors))
for i, k in enumerate(neighbors):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    train_accuracy[i] = knn.score(X_train, y_train)
    test_accuracy[i] = knn.score(X_test, y_test)

# Generate plot: both accuracy curves against k, drawn once after the sweep.
plt.title('k-NN: Varying Number of Neighbors')
plt.plot(neighbors, test_accuracy, label='Testing Accuracy')
plt.plot(neighbors, train_accuracy, label='Training Accuracy')
plt.legend()
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.show()
# Refit k-NN with k=4 and report accuracy on the training split and on the
# held-out split.
knn = KNeighborsClassifier(n_neighbors=4)
knn.fit(X_train, y_train)
print(knn.score(X_train, y_train))
print(knn.score(X_test, y_test))
# Predict survival for the test-frame features. (A stray `len(y_test)`
# expression, whose value was discarded, has been removed.)
prediction = knn.predict(test_x)
0.8312236286919831 0.7471910112359551
# Random forest with trees capped at depth 6 and a fixed seed. fit() returns
# the estimator itself, so construction and training are chained.
model = RandomForestClassifier(max_depth=6, random_state=2).fit(X_train, y_train)
# Accuracy on the training split, then on the held-out split.
print(model.score(X_train, y_train))
print(model.score(X_test, y_test))
# Predictions for the test-frame features.
prediction_1 = model.predict(test_x)
0.8734177215189873 0.8146067415730337
# Assemble the submission table in a single constructor call. Filling an
# empty frame column by column depends on the index created by the first
# assignment lining up with the index of `test`; building both columns
# together pairs each prediction with its passenger id by position.
new_df = pd.DataFrame(
    {
        'PassengerId': test['PassengerId'].to_numpy(),
        'Survived': prediction_1,
    }
)
new_df
PassengerId | Survived | |
---|---|---|
0 | 892 | 0 |
1 | 893 | 1 |
2 | 894 | 0 |
3 | 895 | 0 |
4 | 896 | 1 |
... | ... | ... |
413 | 1305 | 0 |
414 | 1306 | 1 |
415 | 1307 | 0 |
416 | 1308 | 0 |
417 | 1309 | 0 |
418 rows × 2 columns
# Write the submission table to CSV without the index column.
new_df.to_csv('titanic12_submission.csv', index=False)