import pandas as pd
import scipy
import numpy as np

from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt



# Read the training data from disk and preview its first rows
df = pd.read_csv('train.csv')
first_rows = df.head()
print(first_rows)


# Report column dtypes / non-null counts, then the per-row duplicate flags,
# then the descriptive statistics of the frame
df.info()
duplicate_flags = df.duplicated()
print(duplicate_flags)
summary_stats = df.describe()
print(summary_stats)

# Example plot: 10-bin histogram of the 'age' column, with the bin range
# pinned to the column's own minimum and maximum
age_values = df['age']
fig, ax = plt.subplots()
ax.hist(age_values, bins=10, range=(age_values.min(), age_values.max()))
ax.set_title('Age distribution')
ax.set_xlabel('Age')
ax.set_ylabel('Count of Passengers')
plt.show()

# Split the columns in one pass: 'object' dtype counts as categorical,
# every other dtype is treated as numerical
cat_col = []
num_col = []
for col_name, col_dtype in df.dtypes.items():
    if col_dtype == 'object':
        cat_col.append(col_name)
    else:
        num_col.append(col_name)

print('Categorical columns :',cat_col)
print('Numerical columns :',num_col)

# Number of distinct values in each categorical column
print(df[cat_col].nunique())



# One stacked subplot per numerical column. squeeze=False always yields a 2-D
# array of axes, so the single-column case needs no special handling.
fig, axs = plt.subplots(
    nrows=len(num_col), ncols=1, squeeze=False, dpi=80, figsize=(10, 6)
)
axs = axs[:, 0]

# Draw a horizontal box plot of the raw data for each numerical column
for axis, column in zip(axs, num_col):
    df.boxplot(column=column, ax=axis, vert=False)
    axis.set_title(column)  # Title each subplot with its column name

plt.tight_layout()  # Keep titles and tick labels from overlapping
plt.show()


# data cleaning
# Remove outliers with the 1.5 * IQR rule. The row mask is accumulated across
# ALL numerical columns, so a row is kept only if it lies within the bounds of
# every column (previously each iteration overwrote the result and only the
# last column's filter survived).
keep_rows = pd.Series(True, index=df.index)
for col in num_col:
    # Identify the quartiles. Series.quantile ignores missing values, whereas
    # np.percentile yields NaN for a column containing NaN, which made every
    # comparison False and emptied the cleaned frame.
    q1, q3 = df[col].quantile([0.25, 0.75])
    # Calculate the interquartile range
    iqr = q3 - q1
    # Calculate the lower and upper bounds
    lower_bound = q1 - (1.5 * iqr)
    upper_bound = q3 + (1.5 * iqr)
    # A missing value is not an outlier: keep such rows instead of silently
    # dropping them. between() is inclusive on both ends (>= and <=).
    keep_rows &= df[col].between(lower_bound, upper_bound) | df[col].isna()

# Drop the outliers
clean_data = df[keep_rows]

print(clean_data.describe())



# One stacked subplot per numerical column. squeeze=False always yields a 2-D
# array of axes, so the single-column case needs no special handling.
fig, axs = plt.subplots(
    nrows=len(num_col), ncols=1, squeeze=False, dpi=80, figsize=(10, 6)
)
axs = axs[:, 0]

# Draw a horizontal box plot of the cleaned data for each numerical column
for axis, column in zip(axs, num_col):
    clean_data.boxplot(column=column, ax=axis, vert=False)
    axis.set_title(column)  # Title each subplot with its column name

plt.tight_layout()  # Keep titles and tick labels from overlapping
plt.show()





# Write the raw and the cleaned data to one Excel workbook, one sheet each.
# The context manager saves and closes the workbook even if a write fails.
with pd.ExcelWriter('train_clean_data.xlsx', engine='xlsxwriter') as writer:
    # sheet_name must be passed by keyword: passing it positionally is
    # deprecated in pandas 2.x and rejected once it becomes keyword-only.
    df.to_excel(writer, sheet_name='raw data')
    clean_data.to_excel(writer, sheet_name='clean data')

# Show the distinct values present in the 'embarked' column
print(df['embarked'].unique())

# Drop the rows where the target ('embarked') is missing
df.dropna(subset=['embarked'], axis='index', inplace=True)
target = df['embarked']
print(target.unique())

# One-hot encode the target column on its own
one_hot_encoded_target = pd.get_dummies(target)
print(one_hot_encoded_target)

# One-hot encode 'embarked' inside the full frame; other columns are kept
one_hot_encoded_embarked = pd.get_dummies(df, columns=['embarked'])
print(one_hot_encoded_embarked)