Python 3.9.6 (tags/v3.9.6:db3ff76, Jun 28 2021, 15:26:21) [MSC v.1929 64 bit (AMD64)]
Type 'copyright', 'credits' or 'license' for more information
IPython 7.25.0 -- An enhanced Interactive Python. Type '?' for help.

In [ ]:
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns; sns.set_theme()
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

train= pd.read_csv(r"C:\Users\eccel\OneDrive\Documents\onedrive\titanic\train.csv")
train.describe()
Out[ ]:
PassengerId Survived Pclass Age SibSp Parch Fare
count 891.000000 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400
50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 668.500000 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200
In [ ]:
test = pd.read_csv(r"C:\Users\eccel\OneDrive\Documents\onedrive\titanic\test.csv")
test.describe()
Out[ ]:
PassengerId Pclass Age SibSp Parch Fare
count 418.000000 418.000000 332.000000 418.000000 418.000000 417.000000
mean 1100.500000 2.265550 30.272590 0.447368 0.392344 35.627188
std 120.810458 0.841838 14.181209 0.896760 0.981429 55.907576
min 892.000000 1.000000 0.170000 0.000000 0.000000 0.000000
25% 996.250000 1.000000 21.000000 0.000000 0.000000 7.895800
50% 1100.500000 3.000000 27.000000 0.000000 0.000000 14.454200
75% 1204.750000 3.000000 39.000000 1.000000 0.000000 31.500000
max 1309.000000 3.000000 76.000000 8.000000 9.000000 512.329200
In [ ]:
train
Out[ ]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 NaN S
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 B42 S
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.4500 NaN S
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C148 C
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 NaN Q

891 rows × 12 columns

In [ ]:
test
Out[ ]:
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 892 3 Kelly, Mr. James male 34.5 0 0 330911 7.8292 NaN Q
1 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0 1 0 363272 7.0000 NaN S
2 894 2 Myles, Mr. Thomas Francis male 62.0 0 0 240276 9.6875 NaN Q
3 895 3 Wirz, Mr. Albert male 27.0 0 0 315154 8.6625 NaN S
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0 1 1 3101298 12.2875 NaN S
... ... ... ... ... ... ... ... ... ... ... ...
413 1305 3 Spector, Mr. Woolf male NaN 0 0 A.5. 3236 8.0500 NaN S
414 1306 1 Oliva y Ocana, Dona. Fermina female 39.0 0 0 PC 17758 108.9000 C105 C
415 1307 3 Saether, Mr. Simon Sivertsen male 38.5 0 0 SOTON/O.Q. 3101262 7.2500 NaN S
416 1308 3 Ware, Mr. Frederick male NaN 0 0 359309 8.0500 NaN S
417 1309 3 Peter, Master. Michael J male NaN 1 1 2668 22.3583 NaN C

418 rows × 11 columns

In [ ]:
#Data Checking
train.isnull().sum()
Out[ ]:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
In [ ]:
## Data Cleaning
train['Age']=train['Age'].fillna(train['Age'].median())
# train['Died'] = 1 - train['Survived']
train=train[train['Embarked'].notna()]
def getEmbarkedValue(row):
    if row == 'S':
        return 3 
    elif row == 'C':
        return 2
    else:
        return 1
In [ ]:
train['Embarked'].fillna('S',inplace=True)
train['Embarked'] = train['Embarked'].apply(lambda x: getEmbarkedValue(x))
test['Embarked'] = test['Embarked'].apply(lambda x: getEmbarkedValue(x))
d:\Python\lib\site-packages\pandas\core\generic.py:6383: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)
d:\Python\lib\site-packages\pandas\core\frame.py:3607: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)
In [ ]:
train.isnull().sum()
Out[ ]:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64
In [ ]:
train['Cabin'].fillna('U', inplace=True)
train['Cabin'].to_string
train['Cabin'] = train['Cabin'].apply(lambda x: x[0])
train['Cabin'].unique()
Out[ ]:
array(['U', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)
In [ ]:
temp_cabinreplace= {
    'T': 0,
    'U': 1,
    'A': 2,
    'G': 3,
    'C': 4,
    'F': 5,
    'B': 6,
    'E': 7,
    'D': 8
}
train['Cabin'] = train['Cabin'].apply(lambda x: temp_cabinreplace.get(x))
train['Cabin'] = StandardScaler().fit_transform(train['Cabin'].values.reshape(-1, 1))
train.head()['Cabin']
d:\Python\lib\site-packages\pandas\core\frame.py:3607: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)
Out[ ]:
0   -0.491986
1    0.967554
2   -0.491986
3    0.967554
4   -0.491986
Name: Cabin, dtype: float64
In [ ]:
train.shape
def bar_chart(feature):
    survived = train[train['Survived']==1][feature].value_counts()
    dead = train[train['Survived']==0][feature].value_counts()
    df = pd.DataFrame([survived,dead])
    df.index = ['survived','dead']
    df.plot(kind='bar',stacked=True, figsize=(10,5))
    plt.title(feature)

x= ['Pclass','Sex','SibSp','Parch','Embarked']

for i in (x) :
    bar_chart(i)

train.columns
Out[ ]:
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
In [ ]:
train.Sex[train.Sex =='male']=0
train.Sex[train.Sex=='female']=1
train.Sex = pd.to_numeric(train.Sex)

def titanic_corr(data):
    correlation = data.corr()
    sns.heatmap(correlation, annot=True, cbar=True, cmap="RdYlGn")
    
titanic_corr(train)
<ipython-input-12-173f5c2fd076>:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.Sex[train.Sex =='male']=0
d:\Python\lib\site-packages\pandas\core\generic.py:8861: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)
<ipython-input-12-173f5c2fd076>:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.Sex[train.Sex=='female']=1
In [ ]:
y = train['Survived'].values #target
X = train.drop('Survived', axis=1) #feature
X = X.drop('Embarked', axis=1)
X = X.drop(['PassengerId','Cabin','Ticket','Name','SibSp','Age'], axis=1)
X = X.fillna(value=30)
print(X)
     Pclass  Sex  Parch     Fare
0         3    0      0   7.2500
1         1    1      0  71.2833
2         3    1      0   7.9250
3         1    1      0  53.1000
4         3    0      0   8.0500
..      ...  ...    ...      ...
886       2    0      0  13.0000
887       1    1      0  30.0000
888       3    1      2  23.4500
889       1    0      0  30.0000
890       3    0      0   7.7500

[889 rows x 4 columns]
In [ ]:
test.Sex[test.Sex =='male']=0
test.Sex[test.Sex=='female']=1
test.Sex = pd.to_numeric(test.Sex)
test.head()
<ipython-input-14-60bcbf6be31e>:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.Sex[test.Sex =='male']=0
<ipython-input-14-60bcbf6be31e>:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.Sex[test.Sex=='female']=1
Out[ ]:
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 892 3 Kelly, Mr. James 0 34.5 0 0 330911 7.8292 NaN 1
1 893 3 Wilkes, Mrs. James (Ellen Needs) 1 47.0 1 0 363272 7.0000 NaN 3
2 894 2 Myles, Mr. Thomas Francis 0 62.0 0 0 240276 9.6875 NaN 1
3 895 3 Wirz, Mr. Albert 0 27.0 0 0 315154 8.6625 NaN 3
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) 1 22.0 1 1 3101298 12.2875 NaN 3
In [ ]:
test_x = test.drop('Embarked', axis=1)# => Feature
test_x = test_x.drop(['PassengerId','Cabin','Ticket','Name','SibSp','Age'], axis=1)
test_x = test_x.fillna(value=30)
print (test_x)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42, stratify=y)
     Pclass  Sex  Parch      Fare
0         3    0      0    7.8292
1         3    1      0    7.0000
2         2    0      0    9.6875
3         3    0      0    8.6625
4         3    1      1   12.2875
..      ...  ...    ...       ...
413       3    0      0    8.0500
414       1    1      0  108.9000
415       3    0      0    7.2500
416       3    0      0    8.0500
417       3    0      1   22.3583

[418 rows x 4 columns]
In [ ]:
#Using KNN neighbors
neighbors= np.arange(1, 9)
train_accuracy = np.empty(len(neighbors))
test_accuracy = np.empty(len(neighbors))
In [ ]:
for i,k in enumerate(neighbors):
    knn= KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train,y_train)
    train_accuracy[i] = knn.score(X_train,y_train)
    test_accuracy[i] = knn.score(X_test, y_test)

# Generate plot
plt.title('k-NN: Varying Number of Neighbors')
plt.plot(neighbors, test_accuracy, label = 'Testing Accuracy')
plt.plot(neighbors, train_accuracy, label = 'Training Accuracy')
plt.legend()
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.show()
In [ ]:
knn = KNeighborsClassifier(n_neighbors=4)
knn.fit(X_train, y_train)
print(knn.score(X_train,y_train))
print(knn.score(X_test, y_test))
len(y_test)
prediction = knn.predict(test_x)
0.8312236286919831
0.7471910112359551
In [ ]:
model = RandomForestClassifier(max_depth=6, random_state=2)
model.fit (X_train,y_train)
print(model.score(X_train,y_train))
print(model.score(X_test,y_test))
prediction_1=model.predict(test_x)
0.8734177215189873
0.8146067415730337
In [ ]:
new_df = pd.DataFrame(columns=['PassengerId', 'Survived'])
new_df['Survived'] = prediction_1
new_df['PassengerId'] = test.PassengerId

new_df
Out[ ]:
PassengerId Survived
0 892 0
1 893 1
2 894 0
3 895 0
4 896 1
... ... ...
413 1305 0
414 1306 1
415 1307 0
416 1308 0
417 1309 0

418 rows × 2 columns

In [ ]:
new_df.to_csv('titanic12_submission.csv', index=False)