Principal Component Analysis (Using Singular Value Decomposition)¶
This notebook contains an example of linear dimensionality reduction using Singular Value Decomposition (SVD) of the data to project it onto a lower-dimensional space.
When applying PCA using SVD, it is important to do two things:
- Center the input data (around the origin). sklearn.decomposition.PCA does this for us automatically.
- Scale the input data. sklearn.decomposition.PCA does not scale, so we do it ourselves (here with StandardScaler); see the quick check after this list.
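As an aside (a minimal sketch, not one of the notebook's cells), we can check that scikit-learn's PCA is equivalent to an SVD of the centered, but not scaled, data. The toy matrix X below is made up purely for illustration:

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 5)) * [1.0, 2.0, 3.0, 4.0, 5.0]  # columns on very different scales

pca = PCA().fit(X)  # PCA centers X internally before the SVD

Xc = X - X.mean(axis=0)  # center manually (no scaling)
U, S, Vt = np.linalg.svd(Xc, full_matrices=False)

# The singular values of the centered data reproduce PCA's explained variance
print(np.allclose(S**2 / (len(X) - 1), pca.explained_variance_))

Note that no scaling happens anywhere in this equivalence, which is exactly why we standardize the wine data ourselves below.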
In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
In [2]:
wine_data = load_wine(as_frame=True)
In [3]:
# Make sure you have the Data Wrangler extension installed to visualize the .frame attribute.
wine_data.frame
Out[3]:
| | alcohol | malic_acid | ash | alcalinity_of_ash | magnesium | total_phenols | flavanoids | nonflavanoid_phenols | proanthocyanins | color_intensity | hue | od280/od315_of_diluted_wines | proline | target |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 14.23 | 1.71 | 2.43 | 15.6 | 127.0 | 2.80 | 3.06 | 0.28 | 2.29 | 5.64 | 1.04 | 3.92 | 1065.0 | 0 |
| 1 | 13.20 | 1.78 | 2.14 | 11.2 | 100.0 | 2.65 | 2.76 | 0.26 | 1.28 | 4.38 | 1.05 | 3.40 | 1050.0 | 0 |
| 2 | 13.16 | 2.36 | 2.67 | 18.6 | 101.0 | 2.80 | 3.24 | 0.30 | 2.81 | 5.68 | 1.03 | 3.17 | 1185.0 | 0 |
| 3 | 14.37 | 1.95 | 2.50 | 16.8 | 113.0 | 3.85 | 3.49 | 0.24 | 2.18 | 7.80 | 0.86 | 3.45 | 1480.0 | 0 |
| 4 | 13.24 | 2.59 | 2.87 | 21.0 | 118.0 | 2.80 | 2.69 | 0.39 | 1.82 | 4.32 | 1.04 | 2.93 | 735.0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 173 | 13.71 | 5.65 | 2.45 | 20.5 | 95.0 | 1.68 | 0.61 | 0.52 | 1.06 | 7.70 | 0.64 | 1.74 | 740.0 | 2 |
| 174 | 13.40 | 3.91 | 2.48 | 23.0 | 102.0 | 1.80 | 0.75 | 0.43 | 1.41 | 7.30 | 0.70 | 1.56 | 750.0 | 2 |
| 175 | 13.27 | 4.28 | 2.26 | 20.0 | 120.0 | 1.59 | 0.69 | 0.43 | 1.35 | 10.20 | 0.59 | 1.56 | 835.0 | 2 |
| 176 | 13.17 | 2.59 | 2.37 | 20.0 | 120.0 | 1.65 | 0.68 | 0.53 | 1.46 | 9.30 | 0.60 | 1.62 | 840.0 | 2 |
| 177 | 14.13 | 4.10 | 2.74 | 24.5 | 96.0 | 2.05 | 0.76 | 0.56 | 1.35 | 9.20 | 0.61 | 1.60 | 560.0 | 2 |
178 rows × 14 columns
In [4]:
# Remove the target column for scaling
X = wine_data.frame.drop(columns=["target"])
scaler = StandardScaler()
wine_data_scaled = pd.DataFrame(
scaler.fit_transform(X),
columns=X.columns, # preserve column names
index=X.index # preserve row indices too
)
In [5]:
wine_data_scaled
Out[5]:
| | alcohol | malic_acid | ash | alcalinity_of_ash | magnesium | total_phenols | flavanoids | nonflavanoid_phenols | proanthocyanins | color_intensity | hue | od280/od315_of_diluted_wines | proline |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.518613 | -0.562250 | 0.232053 | -1.169593 | 1.913905 | 0.808997 | 1.034819 | -0.659563 | 1.224884 | 0.251717 | 0.362177 | 1.847920 | 1.013009 |
| 1 | 0.246290 | -0.499413 | -0.827996 | -2.490847 | 0.018145 | 0.568648 | 0.733629 | -0.820719 | -0.544721 | -0.293321 | 0.406051 | 1.113449 | 0.965242 |
| 2 | 0.196879 | 0.021231 | 1.109334 | -0.268738 | 0.088358 | 0.808997 | 1.215533 | -0.498407 | 2.135968 | 0.269020 | 0.318304 | 0.788587 | 1.395148 |
| 3 | 1.691550 | -0.346811 | 0.487926 | -0.809251 | 0.930918 | 2.491446 | 1.466525 | -0.981875 | 1.032155 | 1.186068 | -0.427544 | 1.184071 | 2.334574 |
| 4 | 0.295700 | 0.227694 | 1.840403 | 0.451946 | 1.281985 | 0.808997 | 0.663351 | 0.226796 | 0.401404 | -0.319276 | 0.362177 | 0.449601 | -0.037874 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 173 | 0.876275 | 2.974543 | 0.305159 | 0.301803 | -0.332922 | -0.985614 | -1.424900 | 1.274310 | -0.930179 | 1.142811 | -1.392758 | -1.231206 | -0.021952 |
| 174 | 0.493343 | 1.412609 | 0.414820 | 1.052516 | 0.158572 | -0.793334 | -1.284344 | 0.549108 | -0.316950 | 0.969783 | -1.129518 | -1.485445 | 0.009893 |
| 175 | 0.332758 | 1.744744 | -0.389355 | 0.151661 | 1.422412 | -1.129824 | -1.344582 | 0.549108 | -0.422075 | 2.224236 | -1.612125 | -1.485445 | 0.280575 |
| 176 | 0.209232 | 0.227694 | 0.012732 | 0.151661 | 1.422412 | -1.033684 | -1.354622 | 1.354888 | -0.229346 | 1.834923 | -1.568252 | -1.400699 | 0.296498 |
| 177 | 1.395086 | 1.583165 | 1.365208 | 1.502943 | -0.262708 | -0.392751 | -1.274305 | 1.596623 | -0.422075 | 1.791666 | -1.524378 | -1.428948 | -0.595160 |
178 rows × 13 columns
How Many Principal Components Should I Keep?¶
A scree plot is one of the most useful plots for PCA. It shows how much variance each principal component explains, so we can decide how many components to keep.
In [6]:
pca_full = PCA()
pca_full.fit(wine_data_scaled) # fit PCA on all components
# Get explained variance ratio
explained_var = pca_full.explained_variance_ratio_
print(explained_var)
print(f"Sum of explained variance ratios: {explained_var.sum()}")
print(f"First 2 components explain {explained_var[:2].sum()*100:.2f}% of variance.")
print(f"First 3 components explain {explained_var[:3].sum()*100:.2f}% of variance.")
[0.36198848 0.1920749 0.11123631 0.0706903 0.06563294 0.04935823 0.04238679 0.02680749 0.02222153 0.01930019 0.01736836 0.01298233 0.00795215]
Sum of explained variance ratios: 1.0000000000000002
First 2 components explain 55.41% of variance.
First 3 components explain 66.53% of variance.
Looks like the first three components are responsible for about 2/3 of the explained variance. Let's reduce to 3 components!
In [7]:
num_components = len(explained_var)
pca_results_df = pd.DataFrame({
'Principal Component': range(1, num_components + 1),
'Explained Variance Ratio': explained_var
})
# Create the scree plot using seaborn.lineplot()
sns.set_style("darkgrid", {"grid.color": ".6", "grid.linestyle": ":"})
plt.figure(figsize=(10, 6))
sns.lineplot(
data=pca_results_df,
x='Principal Component',
y='Explained Variance Ratio',
marker='o'
)
# Customize the plot
plt.title('Scree Plot', fontsize=16)
plt.xlabel('Principal Component', fontsize=12)
plt.ylabel('Explained Variance Ratio', fontsize=12)
plt.xticks(np.arange(1, num_components + 1, 1)) # Ensure integer ticks on x-axis
plt.show()
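If we prefer to pick the number of components programmatically rather than eyeballing the scree plot, one option (a sketch reusing explained_var from above) is to threshold the cumulative explained variance. The 0.66 cutoff is an assumption chosen to match the 2/3 observation above:

# Cumulative explained variance as a function of the number of components
cum_var = np.cumsum(explained_var)

threshold = 0.66  # hypothetical cutoff; adjust to your needs
# Smallest number of components whose cumulative variance reaches the threshold
n_keep = int(np.argmax(cum_var >= threshold)) + 1
print(f"Keep {n_keep} components ({cum_var[n_keep - 1]*100:.2f}% of variance).")

scikit-learn can also do this directly: passing a float between 0 and 1 to PCA(n_components=...) selects enough components to explain at least that fraction of variance.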
Keep 3 Components¶
We figured out that about 2/3 of the variation can be explained by just three components. Let's use this insight, drop the dimensionality from 13 to 3, and plot it!
In [8]:
pca = PCA(n_components=3)
X_reduced = pca.fit_transform(wine_data_scaled)
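As a quick aside (a sketch reusing pca and wine_data_scaled from the cells above), the fitted components_ matrix shows how much each original feature contributes to each principal component:

# Loadings: one row per principal component, one column per original feature
loadings = pd.DataFrame(
    pca.components_,
    columns=wine_data_scaled.columns,
    index=["PC1", "PC2", "PC3"],
)

# Feature with the largest absolute loading on each component
print(loadings.abs().idxmax(axis=1))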
In [9]:
y = wine_data.target
In [10]:
df_pca3 = pd.DataFrame(X_reduced, columns=['PC1', 'PC2', 'PC3'])
df_pca3['target'] = y
df_pca3['class'] = df_pca3['target'].map(dict(enumerate(wine_data.target_names)))
In [11]:
fig = px.scatter_3d(
df_pca3,
x='PC1',
y='PC2',
z='PC3',
color='class',
title='Wine Dataset projected to 3D (PCA)',
opacity=0.8,
symbol='class',
hover_data={'target': True, 'PC1': ':.2f', 'PC2': ':.2f', 'PC3': ':.2f'},
)
# Adjust appearance
fig.update_traces(marker=dict(size=5))
fig.update_layout(scene=dict(
xaxis_title=f"PC1 ({pca.explained_variance_ratio_[0]*100:.1f}% var)",
yaxis_title=f"PC2 ({pca.explained_variance_ratio_[1]*100:.1f}% var)",
zaxis_title=f"PC3 ({pca.explained_variance_ratio_[2]*100:.1f}% var)",
))
fig.show()
Profit!