import pandas as pd
import warnings
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns; sns.set_theme()
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

# Load the training split and summarise its numeric columns.
train = pd.read_csv(
    r"C:\Users\eccel\OneDrive\Documents\onedrive\titanic\train.csv"
)
train.describe()


# Load the test split the same way.
test = pd.read_csv(
    r"C:\Users\eccel\OneDrive\Documents\onedrive\titanic\test.csv"
)
test.describe()


# Display the raw training frame.
train


# Display the raw test frame.
test


# Data checking: number of missing values in every training column.
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


## Data Cleaning
# Impute missing ages with the median age of the training data.
train['Age'] = train['Age'].fillna(train['Age'].median())
# train['Died'] = 1 - train['Survived']
# Drop the rows that have no embarkation port. The explicit .copy() turns the
# filtered result into an independent DataFrame, so the column assignments
# made later in this script write to `train` itself and no longer trigger
# pandas' SettingWithCopyWarning.
train = train[train['Embarked'].notna()].copy()
def getEmbarkedValue(row):
    """Return the integer code for one embarkation-port value.

    'S' maps to 3 and 'C' maps to 2; every other value (including missing
    values) maps to 1.
    """
    for port, code in (('S', 3), ('C', 2)):
        if row == port:
            return code
    return 1


# Fill any missing embarkation port with 'S'. The result is assigned back to
# the column: calling fillna(inplace=True) on a selected column is chained
# assignment, which pandas warns about and does not guarantee to propagate
# to `train`.
train['Embarked'] = train['Embarked'].fillna('S')
# Encode the port letters as integers in both frames; getEmbarkedValue is
# passed directly, no lambda wrapper is needed.
train['Embarked'] = train['Embarked'].apply(getEmbarkedValue)
test['Embarked'] = test['Embarked'].apply(getEmbarkedValue)

d:\Python\lib\site-packages\pandas\core\generic.py:6383: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)
d:\Python\lib\site-packages\pandas\core\frame.py:3607: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


# Re-count missing values per column after the Age and Embarked cleaning.
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64


# Mark passengers without a recorded cabin as 'U'. The filled column is
# assigned back instead of using fillna(inplace=True) on a selected column,
# which is chained assignment and is not guaranteed to update `train`.
train['Cabin'] = train['Cabin'].fillna('U')
# Keep only the first character of every cabin value (vectorised string
# indexing instead of a per-element lambda).
train['Cabin'] = train['Cabin'].str[0]
# Show the distinct first characters that remain.
train['Cabin'].unique()

array(['U', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)


# Integer code assigned to each cabin letter.
temp_cabinreplace = dict(T=0, U=1, A=2, G=3, C=4, F=5, B=6, E=7, D=8)

# Swap every letter for its code; a letter missing from the table yields None.
train['Cabin'] = train['Cabin'].apply(temp_cabinreplace.get)

# Standardise the codes with StandardScaler, which expects a 2-D
# (n_samples, 1) array, hence the reshape.
cabin_codes = train['Cabin'].values.reshape(-1, 1)
train['Cabin'] = StandardScaler().fit_transform(cabin_codes)
train.head()['Cabin']

d:\Python\lib\site-packages\pandas\core\frame.py:3607: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)

0   -0.491986
1    0.967554
2   -0.491986
3    0.967554
4   -0.491986
Name: Cabin, dtype: float64


# Dimensions of the training frame as (rows, columns).
train.shape
def bar_chart(feature):
    """Plot stacked bars of *feature* value counts, split by survival.

    Reads the module-level ``train`` frame. One bar is drawn for the
    passengers with ``Survived == 1`` and one for ``Survived == 0``; each
    bar is stacked by the values that *feature* takes. The chart is titled
    with the feature name.
    """
    # Value counts for survivors first, then non-survivors, matching the
    # row labels assigned below.
    counts_by_outcome = [
        train.loc[train['Survived'] == flag, feature].value_counts()
        for flag in (1, 0)
    ]
    frame = pd.DataFrame(counts_by_outcome)
    frame.index = ['survived', 'dead']
    frame.plot(kind='bar', stacked=True, figsize=(10, 5))
    plt.title(feature)

# Columns whose survival breakdown is charted.
x = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']

# One stacked bar chart per column.
for i in x:
    bar_chart(i)

# List the columns currently present in the training frame.
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


# Encode sex as 0 (male) / 1 (female). A single .loc[rows, column] assignment
# writes into `train` directly; the chained form `train.Sex[mask] = value`
# assigns through an intermediate object, which pandas warns about and does
# not guarantee to propagate to the frame.
train.loc[train['Sex'] == 'male', 'Sex'] = 0
train.loc[train['Sex'] == 'female', 'Sex'] = 1
# The column still has object dtype after the replacement; make it numeric.
train['Sex'] = pd.to_numeric(train['Sex'])

def titanic_corr(data):
    """Draw an annotated heatmap of the pairwise correlations in *data*.

    Only numeric and boolean columns are correlated. They are selected
    explicitly because ``DataFrame.corr()`` no longer drops text columns on
    its own in pandas 2.0 and later, where it raises on them instead;
    selecting by dtype keeps the function working on both old and new
    pandas versions.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose numeric columns are correlated.
    """
    numeric = data.select_dtypes(include=['number', 'bool'])
    correlation = numeric.corr()
    sns.heatmap(correlation, annot=True, cbar=True, cmap="RdYlGn")
    
# Draw the correlation heatmap for the training frame.
titanic_corr(train)

<ipython-input-12-173f5c2fd076>:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.Sex[train.Sex =='male']=0
d:\Python\lib\site-packages\pandas\core\generic.py:8861: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)
<ipython-input-12-173f5c2fd076>:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.Sex[train.Sex=='female']=1


# Target: the survival flag of every training passenger, as a NumPy array.
y = train['Survived'].values

# Features: the training frame minus the target and the columns that are
# not fed to the models.
dropped_train_columns = [
    'Survived', 'Embarked',
    'PassengerId', 'Cabin', 'Ticket', 'Name', 'SibSp', 'Age',
]
X = train.drop(dropped_train_columns, axis=1)
# Any value still missing in the remaining columns is replaced by 30.
X = X.fillna(value=30)
print(X)

     Pclass  Sex  Parch     Fare
0         3    0      0   7.2500
1         1    1      0  71.2833
2         3    1      0   7.9250
3         1    1      0  53.1000
4         3    0      0   8.0500
..      ...  ...    ...      ...
886       2    0      0  13.0000
887       1    1      0  30.0000
888       3    1      2  23.4500
889       1    0      0  30.0000
890       3    0      0   7.7500

[889 rows x 4 columns]


# Encode sex in the test frame as 0 (male) / 1 (female). A single
# .loc[rows, column] assignment writes into `test` directly; the chained form
# `test.Sex[mask] = value` assigns through an intermediate object, which
# pandas warns about and does not guarantee to propagate to the frame.
test.loc[test['Sex'] == 'male', 'Sex'] = 0
test.loc[test['Sex'] == 'female', 'Sex'] = 1
# The column still has object dtype after the replacement; make it numeric.
test['Sex'] = pd.to_numeric(test['Sex'])
test.head()

<ipython-input-14-60bcbf6be31e>:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.Sex[test.Sex =='male']=0
<ipython-input-14-60bcbf6be31e>:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.Sex[test.Sex=='female']=1


# Feature frame for the unlabeled test passengers: remove the columns that
# are not fed to the models.
dropped_test_columns = [
    'Embarked',
    'PassengerId', 'Cabin', 'Ticket', 'Name', 'SibSp', 'Age',
]
test_x = test.drop(dropped_test_columns, axis=1)
# Any value still missing in the remaining columns is replaced by 30.
test_x = test_x.fillna(value=30)
print(test_x)

# Hold out 20% of the labeled data for validation; stratifying on y keeps
# the class proportions the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

     Pclass  Sex  Parch      Fare
0         3    0      0    7.8292
1         3    1      0    7.0000
2         2    0      0    9.6875
3         3    0      0    8.6625
4         3    1      1   12.2875
..      ...  ...    ...       ...
413       3    0      0    8.0500
414       1    1      0  108.9000
415       3    0      0    7.2500
416       3    0      0    8.0500
417       3    0      1   22.3583

[418 rows x 4 columns]


# k-nearest-neighbours: try k = 1..8 and record the accuracy of each model
# on the training part and on the held-out part.
neighbors = np.arange(1, 9)
train_accuracy = np.empty(neighbors.size)
test_accuracy = np.empty(neighbors.size)

for i, k in enumerate(neighbors):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    train_accuracy[i] = knn.score(X_train, y_train)
    test_accuracy[i] = knn.score(X_test, y_test)

# Plot both accuracy curves against k.
plt.title('k-NN: Varying Number of Neighbors')
plt.plot(neighbors, test_accuracy, label='Testing Accuracy')
plt.plot(neighbors, train_accuracy, label='Training Accuracy')
plt.legend()
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.show()


# Fit the k-NN model with k = 4 and report its accuracy on the training
# part, then on the held-out part.
knn = KNeighborsClassifier(n_neighbors=4).fit(X_train, y_train)
print(knn.score(X_train, y_train))
print(knn.score(X_test, y_test))
# Size of the held-out part.
len(y_test)
# Predicted labels for the unlabeled test passengers.
prediction = knn.predict(test_x)

0.8312236286919831
0.7471910112359551


# Fit a depth-limited random forest (fixed seed) and report its accuracy on
# the training part, then on the held-out part.
model = RandomForestClassifier(max_depth=6, random_state=2).fit(X_train, y_train)
print(model.score(X_train, y_train))
print(model.score(X_test, y_test))
# Predicted labels for the unlabeled test passengers.
prediction_1 = model.predict(test_x)

0.8734177215189873
0.8146067415730337


# Assemble the submission table: one row per test passenger, holding the
# passenger id and the random-forest prediction.
new_df = pd.DataFrame(
    {
        'PassengerId': test.PassengerId,
        'Survived': prediction_1,
    }
)

new_df


# Write the table to disk without the index column.
new_df.to_csv('titanic12_submission.csv', index=False)

	PassengerId	Survived	Pclass	Age	SibSp	Parch	Fare
count	891.000000	891.000000	891.000000	714.000000	891.000000	891.000000	891.000000
mean	446.000000	0.383838	2.308642	29.699118	0.523008	0.381594	32.204208
std	257.353842	0.486592	0.836071	14.526497	1.102743	0.806057	49.693429
min	1.000000	0.000000	1.000000	0.420000	0.000000	0.000000	0.000000
25%	223.500000	0.000000	2.000000	20.125000	0.000000	0.000000	7.910400
50%	446.000000	0.000000	3.000000	28.000000	0.000000	0.000000	14.454200
75%	668.500000	1.000000	3.000000	38.000000	1.000000	0.000000	31.000000
max	891.000000	1.000000	3.000000	80.000000	8.000000	6.000000	512.329200

	PassengerId	Pclass	Age	SibSp	Parch	Fare
count	418.000000	418.000000	332.000000	418.000000	418.000000	417.000000
mean	1100.500000	2.265550	30.272590	0.447368	0.392344	35.627188
std	120.810458	0.841838	14.181209	0.896760	0.981429	55.907576
min	892.000000	1.000000	0.170000	0.000000	0.000000	0.000000
25%	996.250000	1.000000	21.000000	0.000000	0.000000	7.895800
50%	1100.500000	3.000000	27.000000	0.000000	0.000000	14.454200
75%	1204.750000	3.000000	39.000000	1.000000	0.000000	31.500000
max	1309.000000	3.000000	76.000000	8.000000	9.000000	512.329200

	PassengerId	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
0	892	3	Kelly, Mr. James	male	34.5	0	0	330911	7.8292	NaN	Q
1	893	3	Wilkes, Mrs. James (Ellen Needs)	female	47.0	1	0	363272	7.0000	NaN	S
2	894	2	Myles, Mr. Thomas Francis	male	62.0	0	0	240276	9.6875	NaN	Q
3	895	3	Wirz, Mr. Albert	male	27.0	0	0	315154	8.6625	NaN	S
4	896	3	Hirvonen, Mrs. Alexander (Helga E Lindqvist)	female	22.0	1	1	3101298	12.2875	NaN	S
...	...	...	...	...	...	...	...	...	...	...	...
413	1305	3	Spector, Mr. Woolf	male	NaN	0	0	A.5. 3236	8.0500	NaN	S
414	1306	1	Oliva y Ocana, Dona. Fermina	female	39.0	0	0	PC 17758	108.9000	C105	C
415	1307	3	Saether, Mr. Simon Sivertsen	male	38.5	0	0	SOTON/O.Q. 3101262	7.2500	NaN	S
416	1308	3	Ware, Mr. Frederick	male	NaN	0	0	359309	8.0500	NaN	S
417	1309	3	Peter, Master. Michael J	male	NaN	1	1	2668	22.3583	NaN	C

	PassengerId	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
0	892	3	Kelly, Mr. James	0	34.5	0	0	330911	7.8292	NaN	1
1	893	3	Wilkes, Mrs. James (Ellen Needs)	1	47.0	1	0	363272	7.0000	NaN	3
2	894	2	Myles, Mr. Thomas Francis	0	62.0	0	0	240276	9.6875	NaN	1
3	895	3	Wirz, Mr. Albert	0	27.0	0	0	315154	8.6625	NaN	3
4	896	3	Hirvonen, Mrs. Alexander (Helga E Lindqvist)	1	22.0	1	1	3101298	12.2875	NaN	3

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	0	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	0	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	0	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	0	373450	8.0500	NaN	S
...	...	...	...	...	...	...	...	...	...	...	...	...
886	887	0	2	Montvila, Rev. Juozas	male	27.0	0	0	211536	13.0000	NaN	S
887	888	1	1	Graham, Miss. Margaret Edith	female	19.0	0	0	112053	30.0000	B42	S
888	889	0	3	Johnston, Miss. Catherine Helen "Carrie"	female	NaN	1	2	W./C. 6607	23.4500	NaN	S
889	890	1	1	Behr, Mr. Karl Howell	male	26.0	0	0	111369	30.0000	C148	C
890	891	0	3	Dooley, Mr. Patrick	male	32.0	0	0	370376	7.7500	NaN	Q