diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index aa6bff3ad1da..282133c35a7b 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -72,6 +72,12 @@ That's it! The plugin will run every time you commit any changes. If there are a ```bash pre-commit run --all-files --show-diff-on-failure ``` +### Skipping Pre-commit Hooks for Exploratory Changes + +When you're making local experimental changes and don't want pre-commit hooks to run, you can bypass them using: + +```bash +git commit --no-verify -m "Exploratory: Testing XYZ" #### Coding Style diff --git a/DIRECTORY.md b/DIRECTORY.md index ca454bd5fd82..c6f11be43f48 100644 --- a/DIRECTORY.md +++ b/DIRECTORY.md @@ -631,6 +631,7 @@ * [Similarity Search](machine_learning/similarity_search.py) * [Support Vector Machines](machine_learning/support_vector_machines.py) * [T Stochastic Neighbour Embedding](machine_learning/t_stochastic_neighbour_embedding.py) + * [Uniform Manifold Approximation And Projection](machine_learning/uniform_manifold_approximation_and_projection.py) * [Word Frequency Functions](machine_learning/word_frequency_functions.py) * [Xgboost Classifier](machine_learning/xgboost_classifier.py) * [Xgboost Regressor](machine_learning/xgboost_regressor.py) diff --git a/machine_learning/uniform_manifold_approximation_and_projection.py b/machine_learning/uniform_manifold_approximation_and_projection.py new file mode 100644 index 000000000000..4195ed2e041e --- /dev/null +++ b/machine_learning/uniform_manifold_approximation_and_projection.py @@ -0,0 +1,419 @@ +""" +Uniform Manifold Approximation and Projection (UMAP) +===================================================== + +UMAP is a dimensionality reduction technique — it takes high-dimensional data +(e.g. 4 features in Iris) and compresses it into 2D or 3D so we can visualize +and understand it. + +Think of it like this: + - Imagine 150 students, each described by 4 exam scores. + - UMAP "maps" them onto a 2D sheet of paper so that students who scored + similarly end up near each other on the sheet. + +How UMAP works (simplified, educational version): + Step 1 — Compute pairwise distances between all data points. + Step 2 — For each point, find its k nearest neighbours. + Step 3 — Build a "fuzzy" graph: nearby neighbours get high membership + strength, faraway points get low strength. + Step 4 — Start with a random 2D layout, then iteratively pull neighbours + closer and push non-neighbours further apart. + +This implementation is intentionally simplified for learning purposes. +It does NOT replicate the full UMAP paper math, but faithfully captures +the core ideas using only NumPy and scikit-learn. + +References: + https://umap-learn.readthedocs.io/en/latest/how_umap_works.html + https://en.wikipedia.org/wiki/Nonlinear_dimensionality_reduction + McInnes, L., Healy, J., & Melville, J. (2018). UMAP: Uniform Manifold + Approximation and Projection for Dimension Reduction. arXiv:1802.03426. +""" + +import doctest + +import numpy as np +from numpy import ndarray +from sklearn.datasets import load_iris + +# --------------------------------------------------------------------------- +# Step 0 — Load dataset +# --------------------------------------------------------------------------- + + +def collect_dataset() -> tuple[ndarray, ndarray]: + """ + Load the Iris dataset and return features and class labels. + + The Iris dataset has 150 flowers, each described by 4 measurements: + sepal length, sepal width, petal length, petal width. + + Returns: + tuple[ndarray, ndarray]: + - features : shape (150, 4) — the 4 measurements per flower + - targets : shape (150,) — flower species (0, 1, or 2) + + >>> features, targets = collect_dataset() + >>> features.shape + (150, 4) + >>> targets.shape + (150,) + >>> int(targets.min()), int(targets.max()) + (0, 2) + """ + iris = load_iris() + return np.array(iris.data), np.array(iris.target) + + +# --------------------------------------------------------------------------- +# Step 1 — Pairwise Euclidean distances +# --------------------------------------------------------------------------- + + +def compute_pairwise_distances(data: ndarray) -> ndarray: + """ + Compute the Euclidean distance between every pair of data points. + + For n points in d-dimensional space this returns an (n x n) matrix + where entry [i, j] is the straight-line distance between point i and j. + + The diagonal is 0 because a point has zero distance to itself. + + Args: + data: Array of shape (n_samples, n_features). + + Returns: + ndarray: Symmetric distance matrix of shape (n_samples, n_samples). + + >>> pts = np.array([[0.0, 0.0], [3.0, 4.0]]) + >>> d = compute_pairwise_distances(pts) + >>> float(round(d[0, 1], 4)) + 5.0 + >>> float(d[0, 0]) + 0.0 + """ + # ||a - b||^2 = ||a||^2 + ||b||^2 - 2*a·b + squared_norms = np.sum(data**2, axis=1) # shape (n,) + squared_dists = ( + squared_norms[:, np.newaxis] + - 2.0 * data @ data.T + + squared_norms[np.newaxis, :] + ) + # Numerical noise can make tiny values negative — clamp to 0 + squared_dists = np.maximum(squared_dists, 0.0) + return np.sqrt(squared_dists) + + +# --------------------------------------------------------------------------- +# Step 2 — k-Nearest Neighbours +# --------------------------------------------------------------------------- + + +def find_nearest_neighbors(distance_matrix: ndarray, n_neighbors: int) -> ndarray: + """ + For every point, find the indices of its k closest neighbours. + + UMAP's core idea: the structure of the data is captured by *who is + close to whom*, not the exact distances. + + Args: + distance_matrix: Symmetric (n x n) distance matrix. + n_neighbors: How many neighbours to keep per point (k). + + Returns: + ndarray: Integer array of shape (n_samples, n_neighbors). + Row i contains the indices of point i's k nearest neighbours + (not including itself). + + >>> dist = np.array([[0., 1., 2.], [1., 0., 3.], [2., 3., 0.]]) + >>> nn = find_nearest_neighbors(dist, n_neighbors=1) + >>> nn.tolist() + [[1], [0], [0]] + """ + n_samples = distance_matrix.shape[0] + neighbor_indices = np.zeros((n_samples, n_neighbors), dtype=int) + + for i in range(n_samples): + # argsort gives indices that would sort the row (ascending) + # skip index 0 because that is the point itself (distance = 0) + sorted_indices = np.argsort(distance_matrix[i]) + neighbor_indices[i] = sorted_indices[1 : n_neighbors + 1] + + return neighbor_indices + + +# --------------------------------------------------------------------------- +# Step 3 — Fuzzy membership strengths (the "fuzzy graph") +# --------------------------------------------------------------------------- + + +def compute_membership_strengths( + distance_matrix: ndarray, + neighbor_indices: ndarray, +) -> ndarray: + """ + Assign a "how strongly connected?" weight to every (point, neighbour) pair. + + In the original UMAP paper this comes from constructing a fuzzy simplicial + set. Here we use a simpler but conceptually equivalent idea: + + strength(i, j) = exp( -(dist(i,j) - rho_i) / sigma_i ) + + where + rho_i = distance to the nearest neighbour of i (local scale) + sigma_i = mean distance to all k neighbours of i (bandwidth) + + Points that are very close get strength ≈ 1. + Points that are far apart get strength ≈ 0. + + The result is a sparse (n x n) weight matrix. Only neighbour entries + are non-zero; the rest stay 0. + + Args: + distance_matrix: (n x n) Euclidean distance matrix. + neighbor_indices: (n x k) nearest-neighbour index array. + + Returns: + ndarray: Weight matrix of shape (n_samples, n_samples). + + >>> dist = np.array([[0., 1., 2.], [1., 0., 3.], [2., 3., 0.]]) + >>> nn = find_nearest_neighbors(dist, n_neighbors=1) + >>> W = compute_membership_strengths(dist, nn) + >>> W.shape + (3, 3) + >>> float(round(W[0, 1], 4)) # nearest neighbour → strength = 1.0 + 1.0 + """ + n_samples = distance_matrix.shape[0] + weights = np.zeros((n_samples, n_samples)) + + for i in range(n_samples): + nbr_idx = neighbor_indices[i] # indices of k neighbours + nbr_dists = distance_matrix[i, nbr_idx] # distances to those neighbours + + rho_i = nbr_dists.min() # nearest-neighbour distance + sigma_i = nbr_dists.mean() + 1e-10 # average distance (bandwidth) + + for j_pos, j in enumerate(nbr_idx): + d = nbr_dists[j_pos] + strength = np.exp(-(d - rho_i) / sigma_i) + weights[i, j] = strength + + # Symmetrise: if i→j OR j→i has an edge, keep the max weight + weights = np.maximum(weights, weights.T) + return weights + + +# --------------------------------------------------------------------------- +# Step 4 — Optimise the 2-D layout (attractive + repulsive forces) +# --------------------------------------------------------------------------- + + +def optimize_embedding( + weight_matrix: ndarray, + n_components: int = 2, + n_epochs: int = 200, + learning_rate: float = 1.0, + repulsion_strength: float = 1.0, + random_state: int = 42, +) -> ndarray: + """ + Place points in low-dimensional space so that the fuzzy graph structure + is preserved as faithfully as possible. + + Optimisation idea (force-directed layout): + • Connected pairs → attract each other (pull together) + • All other pairs → repel each other (push apart) + + Each iteration nudges every point slightly in the direction that makes + connected neighbours closer and disconnected points further away. + + Args: + weight_matrix: (n x n) fuzzy membership strengths. + n_components: Target number of output dimensions (usually 2). + n_epochs: Number of optimisation passes. + learning_rate: Step size for each gradient update. + repulsion_strength: How strongly non-neighbours are pushed apart. + random_state: Seed for reproducibility. + + Returns: + ndarray: Embedding of shape (n_samples, n_components). + + >>> rng = np.random.default_rng(0) + >>> W = rng.random((5, 5)) + >>> W = (W + W.T) / 2 # make symmetric + >>> emb = optimize_embedding(W, n_components=2, n_epochs=5) + >>> emb.shape + (5, 2) + """ + rng = np.random.default_rng(random_state) + n_samples = weight_matrix.shape[0] + + # Initialise embedding randomly in a small range + embedding = rng.uniform(low=-10.0, high=10.0, size=(n_samples, n_components)) + + for epoch in range(n_epochs): + # Decay learning rate gently over time + alpha = learning_rate * (1.0 - epoch / n_epochs) + + for i in range(n_samples): + for j in range(n_samples): + if i == j: + continue + + diff = embedding[i] - embedding[j] + dist2 = np.dot(diff, diff) + 1e-6 # squared distance + epsilon + + w = weight_matrix[i, j] + + if w > 0: + # --- Attractive force --- + # Pull i toward j proportional to edge weight + grad = -2.0 * w * diff / (dist2 + 1.0) + else: + # --- Repulsive force --- + # Push i away from j + grad = 2.0 * repulsion_strength * diff / (dist2 * (0.1 + dist2)) + + embedding[i] += alpha * grad + + return embedding + + +# --------------------------------------------------------------------------- +# Public API — apply_umap() +# --------------------------------------------------------------------------- + + +def apply_umap( + data: ndarray, + n_components: int = 2, + n_neighbors: int = 15, + n_epochs: int = 200, + learning_rate: float = 0.01, + random_state: int = 42, +) -> ndarray: + """ + Apply the full (educational) UMAP pipeline to reduce data dimensionality. + + This chains all four steps: + collect distances → find neighbours → build fuzzy graph → optimise layout + + Args: + data: Input data of shape (n_samples, n_features). + n_components: Number of output dimensions (default 2). + n_neighbors: Number of nearest neighbours to consider (default 15). + Smaller k → more local structure. + Larger k → more global structure. + n_epochs: Optimisation iterations (more = better but slower). + learning_rate: Step size for embedding updates. + random_state: Seed for reproducibility. + + Returns: + ndarray: Reduced embedding of shape (n_samples, n_components). + + >>> features, _ = collect_dataset() + >>> embedding = apply_umap(features, n_components=2, n_epochs=5) + >>> embedding.shape + (150, 2) + """ + if n_components < 1: + raise ValueError("n_components must be >= 1") + if n_neighbors < 1: + raise ValueError("n_neighbors must be >= 1") + if n_epochs < 1: + raise ValueError("n_epochs must be >= 1") + + # Step 1 — Distances + dist_matrix = compute_pairwise_distances(data) + + # Step 2 — Nearest neighbours + nn_indices = find_nearest_neighbors(dist_matrix, n_neighbors) + + # Step 3 — Fuzzy graph + weights = compute_membership_strengths(dist_matrix, nn_indices) + + # Step 4 — Optimise layout + embedding = optimize_embedding( + weights, + n_components=n_components, + n_epochs=n_epochs, + learning_rate=learning_rate, + random_state=random_state, + ) + + return embedding + + +# --------------------------------------------------------------------------- +# main() +# --------------------------------------------------------------------------- + + +def main() -> None: + """ + Run educational UMAP on the Iris dataset and print the first 5 embeddings. + + >>> main() # doctest: +ELLIPSIS + === Educational UMAP on the Iris Dataset === + ... + """ + print("=== Educational UMAP on the Iris Dataset ===") + print() + print("Loading Iris dataset …") + features, labels = collect_dataset() + print(f" Input shape : {features.shape} ((150 flowers x 2 dimensions))") + print() + + print("Running UMAP (n_neighbors=5, n_epochs=50) …") + print(" (Using reduced epochs for speed — increase for better results)") + embedding = apply_umap( + features, + n_components=2, + n_neighbors=5, + n_epochs=50, + random_state=0, + ) + + print(f" Output shape : {embedding.shape} ((150 flowers x 2 dimensions))") + print() + + print("First 5 embedded points (each flower is now a 2-D coordinate):") + print(f" {'Point':>5} {'Species':>10} {'Dim-1':>10} {'Dim-2':>10}") + species_names = {0: "Setosa", 1: "Versicolor", 2: "Virginica"} + for idx in range(5): + name = species_names[int(labels[idx])] + x, y = embedding[idx] + print(f" {idx:>5} {name:>10} {x:>10.4f} {y:>10.4f}") + + print() + print("Interpretation:") + print(" Points belonging to the same species should cluster together.") + print(" UMAP has compressed 4-D information into 2-D while preserving") + print(" neighbourhood structure — similar flowers stay near each other.") + + # ── Optional visualisation ────────────────────────────────────────────── + # Uncomment the block below if matplotlib is available: + # + # import matplotlib.pyplot as plt + # colours = ["red", "green", "blue"] + # for species_id in range(3): + # mask = labels == species_id + # plt.scatter( + # embedding[mask, 0], + # embedding[mask, 1], + # c=colours[species_id], + # label=species_names[species_id], + # alpha=0.7, + # ) + # plt.title("Educational UMAP — Iris Dataset") + # plt.xlabel("UMAP Dimension 1") + # plt.ylabel("UMAP Dimension 2") + # plt.legend() + # plt.tight_layout() + # plt.show() + + +if __name__ == "__main__": + doctest.testmod(verbose=False) + main()