"""Hierarchical clustering of family-related words.

Pipeline: load words from CSV -> character n-gram TF-IDF -> Ward linkage ->
dendrogram -> flat clusters -> silhouette evaluation -> 2-D PCA scatter ->
save cluster assignments.
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, silhouette_samples
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

# Load the uploaded file; the CSV must contain a 'Words_Family' column.
file_path = "family_words.csv"
data = pd.read_csv(file_path)

# Extract the words from the file
words = data['Words_Family']

# Vectorize the words using TF-IDF over character 2- and 3-grams, so words
# sharing stems/affixes end up close in feature space.
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 3))
X = vectorizer.fit_transform(words)

# Print the dimensions of X
print("The dimensions of X are:", X.shape)

# Densify once and reuse: both PCA and Ward linkage need a dense array,
# and the original code called X.toarray() twice.
X_dense = X.toarray()

# Reduce dimensionality for visualization only (clustering uses full space).
pca = PCA(n_components=2)
reduced_X = pca.fit_transform(X_dense)

# Generate linkage matrix for hierarchical clustering (Ward minimizes
# within-cluster variance at each merge).
linkage_matrix = linkage(X_dense, method='ward')

# Adjust the dendrogram plotting for better scaling and readability
plt.figure(figsize=(16, 12))  # Larger figure size
dendrogram(
    linkage_matrix,
    labels=words.tolist(),
    leaf_rotation=90,      # Rotate labels for better visibility
    leaf_font_size=10,     # Increase font size for readability
    color_threshold=0.7 * max(linkage_matrix[:, 2]),  # Color branches below threshold
)
plt.title('Dendrogram for Hierarchical Clustering', fontsize=16)
plt.xlabel('Words', fontsize=14)
plt.ylabel('Distance', fontsize=14)
plt.tight_layout()  # Ensure labels and plot fit within the figure
# Adjust layout to prevent label truncation (overrides tight_layout margins).
plt.subplots_adjust(bottom=0.3, left=0.1, right=0.9, top=0.9)
plt.show()

# Cut the dendrogram into at most n_clusters flat clusters.
n_clusters = 5  # Adjust the number of clusters as needed
clusters = fcluster(linkage_matrix, n_clusters, criterion='maxclust')

# Assign clusters to the data
data['Cluster'] = clusters

# Compute the silhouette score for cluster evaluation.
# NOTE: the original paste had this f-string broken across a line boundary;
# reconstructed as a single print.
silhouette_avg = silhouette_score(X, clusters, metric='euclidean')
print(f"Silhouette Score for {n_clusters} clusters: {silhouette_avg:.2f}")

# Create a silhouette plot for detailed per-sample analysis.
silhouette_vals = silhouette_samples(X, clusters, metric='euclidean')
plt.figure(figsize=(10, 8))
y_lower = 10
for i in range(1, n_clusters + 1):
    ith_cluster_silhouette_vals = silhouette_vals[clusters == i]
    ith_cluster_silhouette_vals.sort()
    size_cluster_i = ith_cluster_silhouette_vals.shape[0]
    y_upper = y_lower + size_cluster_i
    color = plt.cm.tab10(float(i) / n_clusters)
    plt.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_vals,
                      facecolor=color, edgecolor=color, alpha=0.7)
    plt.text(-0.05, y_lower + 0.5 * size_cluster_i, f'Cluster {i}', fontsize=10)
    y_lower = y_upper + 10  # Add space between clusters
plt.title('Silhouette Plot for Hierarchical Clustering', fontsize=16)
plt.xlabel('Silhouette Coefficient Values', fontsize=14)
plt.ylabel('Cluster Label', fontsize=14)
plt.axvline(x=silhouette_avg, color="red", linestyle="--",
            label=f'Average Silhouette Score ({silhouette_avg:.2f})')
plt.legend()
plt.tight_layout()
plt.show()

# Plot words after clustering using PCA for visualization
plt.figure(figsize=(12, 8))
colors = plt.get_cmap('tab10')
for cluster in range(1, n_clusters + 1):
    cluster_indices = (data['Cluster'] == cluster).values
    cluster_points = reduced_X[cluster_indices]
    cluster_words = words[cluster_indices]
    plt.scatter(
        cluster_points[:, 0],
        cluster_points[:, 1],
        label=f'Cluster {cluster}',
        c=[colors(cluster / n_clusters)]
    )
    for i, word in enumerate(cluster_words):
        plt.text(
            cluster_points[i, 0],
            cluster_points[i, 1],
            f"{word} ({cluster})",
            fontsize=8
        )
plt.title('Clustered Words (Hierarchical Clustering)')
plt.xlabel('PCA Feature 1')
plt.ylabel('PCA Feature 2')
plt.legend()
plt.grid(True)
plt.show()

# Save the clustered data for further use
clustered_file_path = "family_words_hierarchical_clustered.csv"
data.to_csv(clustered_file_path, index=False)

# Print the cluster assignments
print("\nCluster assignments:")
print(data[['Words_Family', 'Cluster']])
# Verification: display the contents of each cluster.
# Group the DataFrame once instead of re-filtering it for every cluster
# in both loops below (the original did O(k) full-column mask passes twice).
members_by_cluster = {
    label: group['Words_Family'].tolist()
    for label, group in data.groupby('Cluster')
}

for cluster in range(1, n_clusters + 1):
    # fcluster's 'maxclust' may yield fewer than n_clusters labels, so an
    # absent label is reported as an empty cluster rather than a KeyError.
    cluster_words = members_by_cluster.get(cluster, [])
    print(f"\nCluster {cluster} contains {len(cluster_words)} words:")
    print(cluster_words)

# Optional: Check for clusters with very few members (singleton or empty).
for cluster in range(1, n_clusters + 1):
    cluster_size = len(members_by_cluster.get(cluster, []))
    if cluster_size < 2:
        print(f"Warning: Cluster {cluster} has very few members ({cluster_size} words). Consider reviewing.")