feat: Implement custom PCA to remove sklearn dependency

- Add custom PCA implementation using numpy eigendecomposition - Replace sklearn.decomposition.PCA with custom implementation - Maintains same API (fit, transform, fit_transform) - Supports explained_variance_ratio_ for variance analysis - Removes scikit-learn dependency from project - Add type hints and assertion for type safety
2025-11-15 02:02:57 +01:00
parent 2a078093ed
commit 9a62c8478f
2 changed files with 721 additions and 0 deletions
@@ -0,0 +1,140 @@
+"""Custom PCA implementation for dimensionality reduction.
+
+Implements Principal Component Analysis without scikit-learn dependency.
+Used for reducing high-dimensional embeddings (768-dim) to 2D for visualization.
+"""
+
+import logging
+
+import numpy as np
+
+logger = logging.getLogger(__name__)
+
+
+class PCA:
+    """Principal Component Analysis for dimensionality reduction.
+
+    Simple implementation that finds principal components via eigendecomposition
+    of the covariance matrix. Suitable for small-to-medium datasets.
+
+    Attributes:
+        n_components: Number of principal components to keep
+        mean_: Mean of training data (set during fit)
+        components_: Principal components (eigenvectors)
+        explained_variance_: Variance explained by each component
+        explained_variance_ratio_: Fraction of total variance explained
+    """
+
+    def __init__(self, n_components: int = 2):
+        """Initialize PCA.
+
+        Args:
+            n_components: Number of components to keep (default: 2)
+        """
+        if n_components < 1:
+            raise ValueError(f"n_components must be >= 1, got {n_components}")
+
+        self.n_components = n_components
+        self.mean_: np.ndarray | None = None
+        self.components_: np.ndarray | None = None
+        self.explained_variance_: np.ndarray | None = None
+        self.explained_variance_ratio_: np.ndarray | None = None
+
+    def fit(self, X: np.ndarray) -> "PCA":
+        """Fit PCA model to data.
+
+        Args:
+            X: Training data of shape (n_samples, n_features)
+
+        Returns:
+            self (for method chaining)
+
+        Raises:
+            ValueError: If X has fewer features than n_components
+        """
+        X = np.asarray(X)
+
+        if X.ndim != 2:
+            raise ValueError(f"X must be 2D array, got shape {X.shape}")
+
+        n_samples, n_features = X.shape
+
+        if n_features < self.n_components:
+            raise ValueError(
+                f"n_components={self.n_components} > n_features={n_features}"
+            )
+
+        # Center data
+        self.mean_ = np.mean(X, axis=0)
+        X_centered = X - self.mean_
+
+        # Compute covariance matrix
+        # Use (X^T X) / (n-1) for numerical stability with high-dim data
+        cov = np.cov(X_centered.T)
+
+        # Eigendecomposition
+        eigenvalues, eigenvectors = np.linalg.eigh(cov)
+
+        # Sort by eigenvalue (descending)
+        idx = np.argsort(eigenvalues)[::-1]
+        eigenvalues = eigenvalues[idx]
+        eigenvectors = eigenvectors[:, idx]
+
+        # Keep top n_components
+        self.components_ = eigenvectors[:, : self.n_components].T
+        self.explained_variance_ = eigenvalues[: self.n_components]
+
+        # Calculate explained variance ratio
+        total_variance = np.sum(eigenvalues)
+        if total_variance > 0:
+            self.explained_variance_ratio_ = self.explained_variance_ / total_variance
+        else:
+            self.explained_variance_ratio_ = np.zeros(self.n_components)
+
+        logger.debug(
+            f"PCA fit: {n_samples} samples, {n_features} features → "
+            f"{self.n_components} components, "
+            f"explained variance: {self.explained_variance_ratio_}"
+        )
+
+        return self
+
+    def transform(self, X: np.ndarray) -> np.ndarray:
+        """Transform data to principal component space.
+
+        Args:
+            X: Data to transform of shape (n_samples, n_features)
+
+        Returns:
+            Transformed data of shape (n_samples, n_components)
+
+        Raises:
+            ValueError: If PCA not fitted yet
+        """
+        if self.mean_ is None or self.components_ is None:
+            raise ValueError("PCA not fitted yet. Call fit() first.")
+
+        X = np.asarray(X)
+
+        if X.ndim != 2:
+            raise ValueError(f"X must be 2D array, got shape {X.shape}")
+
+        # Center using training mean
+        X_centered = X - self.mean_
+
+        # Project onto principal components
+        X_transformed = np.dot(X_centered, self.components_.T)
+
+        return X_transformed
+
+    def fit_transform(self, X: np.ndarray) -> np.ndarray:
+        """Fit PCA model and transform data in one step.
+
+        Args:
+            X: Training data of shape (n_samples, n_features)
+
+        Returns:
+            Transformed data of shape (n_samples, n_components)
+        """
+        self.fit(X)
+        return self.transform(X)