Source code for sparse_nmf.data

"""Sample data loaders for the sparse_nmf package.

A few small synthetic datasets shipped with the package so users can
run the examples / tests without an external download. Each dataset is
constructed to have a known low-rank-plus-noise structure, which makes
it useful both as a doctest fixture and as a sanity check that NMF is
recovering something meaningful.

Public helpers:

- :func:`generate_synthetic_sparse` — programmatic factory for a
  ``(n_samples, n_features)`` CSR matrix with controllable rank and
  density. Deterministic via ``seed``.
- :func:`load_synthetic_sparse` — loads the bundled
  ``data/synthetic_sparse.npz`` if present, falls back to generating
  on the fly.
"""

from __future__ import annotations

from importlib import resources

import numpy as np


def generate_synthetic_sparse(
    n_samples: int = 500,
    n_features: int = 1_000,
    n_components: int = 8,
    density: float = 0.05,
    noise: float = 0.1,
    seed: int = 0,
):
    """Build a ``(n_samples, n_features)`` CSR matrix with
    rank-``n_components`` structure plus sparse noise.

    The matrix is constructed as ``W @ H + noise``, clipped to be
    non-negative, then thresholded to keep only the top ``density``
    fraction of entries — emulating the sparsity pattern of, e.g.,
    gene-association count data.

    Parameters
    ----------
    n_samples, n_features
        Output shape.
    n_components
        Rank of the underlying low-rank structure. NMF with
        ``n_components`` should recover (close to) this.
    density
        Fraction of non-zero entries in the output; must lie in
        ``[0, 1]``. The number of kept entries is
        ``int(density * n_samples * n_features)``; when that is zero an
        empty matrix is returned.
    noise
        Standard deviation of additive Gaussian noise on the dense
        product before thresholding, expressed as a multiple of the
        product's own standard deviation. Larger ``noise`` makes
        recovery harder.
    seed
        RNG seed for reproducibility.

    Returns
    -------
    scipy.sparse.csr_matrix
        Shape ``(n_samples, n_features)``, dtype ``float32``,
        non-negative. Entries tied with the threshold are all kept, and
        entries clipped to zero are never stored, so the stored count
        can differ slightly from the requested one.

    Raises
    ------
    ValueError
        If ``density`` is outside ``[0, 1]`` (including NaN).
    """
    # scipy is imported lazily so importing this module stays cheap.
    from scipy.sparse import csr_matrix

    # The comparison is written so that NaN also fails the check.
    if not 0.0 <= density <= 1.0:
        raise ValueError(f"density must be in [0, 1], got {density!r}")

    n_keep = int(density * n_samples * n_features)
    if n_keep == 0:
        # ``np.partition(x, -0)[-0]`` would select the *minimum*, leaving the
        # matrix fully dense; an empty request must give an empty matrix.
        return csr_matrix((n_samples, n_features), dtype=np.float32)

    # The order of RNG draws below is part of the reproducibility contract:
    # changing it would change the output for a given ``seed``.
    rng = np.random.default_rng(seed)
    W = rng.gamma(shape=2.0, scale=1.0, size=(n_samples, n_components)).astype(np.float32)
    H = rng.gamma(shape=2.0, scale=1.0, size=(n_components, n_features)).astype(np.float32)

    dense = W @ H
    # Noise is scaled by the spread of the clean signal.
    dense += noise * rng.standard_normal(dense.shape).astype(np.float32) * dense.std()
    # Gaussian noise can push entries negative; NMF input must be >= 0.
    dense = np.clip(dense, 0.0, None)

    # The n_keep-th largest value is the cut-off; everything strictly below
    # it is dropped.
    threshold = np.partition(dense.ravel(), -n_keep)[-n_keep]
    dense[dense < threshold] = 0.0
    return csr_matrix(dense)
def load_synthetic_sparse():
    """Load the bundled ``synthetic_sparse.npz``; generate if missing.

    The archive is looked up as a resource of ``sparse_nmf.data`` and is
    expected to hold the four CSR components under the keys ``data``,
    ``indices``, ``indptr`` and ``shape``. If the resource (or the
    package it is anchored to) cannot be found, the matrix is produced by
    :func:`generate_synthetic_sparse` with its default arguments.

    The bundled file is intended to have the same shape as
    :func:`generate_synthetic_sparse`'s defaults so callers can switch
    between the two without changing downstream code; this function does
    not itself verify that.

    Returns
    -------
    scipy.sparse.csr_matrix
        The bundled matrix, or a freshly generated one as a fallback.
    """
    try:
        bundled = resources.files("sparse_nmf.data").joinpath("synthetic_sparse.npz")
        # ``np.load`` on an ``.npz`` returns a lazy ``NpzFile``: members are
        # only read from the underlying handle on item access. Read all of
        # them while the handle is open, and close the archive afterwards.
        with bundled.open("rb") as f, np.load(f) as archive:
            values = archive["data"]
            indices = archive["indices"]
            indptr = archive["indptr"]
            shape = tuple(int(dim) for dim in archive["shape"])
    except (FileNotFoundError, ModuleNotFoundError):
        # Missing resource or missing anchor package: build the data instead.
        return generate_synthetic_sparse()

    # scipy is imported lazily so importing this module stays cheap.
    from scipy.sparse import csr_matrix

    return csr_matrix((values, indices, indptr), shape=shape)