Commit 2eaa686
Feat: Implement Random Projections (#332)
* Add random projection algorithms for dimensionality reduction. Contains two algorithms based on variants of the Johnson-Lindenstrauss lemma:
  - Random projections with Gaussian coefficients
  - Sparse random projections with +/-1 coefficients (multiplied by a scaling factor)
* Update readme
* Add RNG to random projection structs. The RNG defaults to Xoshiro256Plus if not provided by the user. Also add tests for the minimum dimension, using values from scikit-learn.
* Check that random projections actually reduce the dimension of the data.
* Use a fixed dimension in error tests
* Refactor random projections code
1 parent a29abe8 commit 2eaa686
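For readers unfamiliar with the lemma: both schemes named above reduce dimension by multiplying the data with a random matrix whose shape is chosen so pairwise distances are approximately preserved. Below is a minimal, self-contained sketch of the standard Johnson-Lindenstrauss lower bound (the same formula scikit-learn uses) and one common sparse +/-1 coefficient scheme. The function names, density, and scaling here are illustrative assumptions, not this crate's API.

```rust
use rand::Rng;

/// Minimum embedding dimension `k` for `n_samples` points so that all
/// pairwise distances are preserved within (1 ± eps), per the standard
/// Johnson-Lindenstrauss bound: k >= 4 ln(n) / (eps^2/2 - eps^3/3).
fn jl_min_dim(n_samples: usize, eps: f64) -> usize {
    let denom = eps.powi(2) / 2.0 - eps.powi(3) / 3.0;
    (4.0 * (n_samples as f64).ln() / denom).ceil() as usize
}

/// One entry of a sparse projection matrix in the Achlioptas/Li style:
/// zero with probability 1 - density, otherwise ±sqrt(1 / (density * k))
/// with equal probability. The density and scaling linfa actually uses
/// may differ.
fn sparse_coefficient<R: Rng>(rng: &mut R, density: f64, k: usize) -> f64 {
    if rng.gen::<f64>() < density {
        let sign = if rng.gen::<bool>() { 1.0 } else { -1.0 };
        sign / (density * k as f64).sqrt()
    } else {
        0.0
    }
}

fn main() {
    // 10_000 samples at eps = 0.1, matching the MNIST examples below.
    let k = jl_min_dim(10_000, 0.1);
    println!("JL minimum dimension: {k}");
    let mut rng = rand::thread_rng();
    println!("sample coefficient: {}", sparse_coefficient(&mut rng, 0.1, k));
}
```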

11 files changed: +726 additions, −1 deletion

algorithms/linfa-reduction/Cargo.toml

Lines changed: 8 additions & 1 deletion
```diff
@@ -1,7 +1,10 @@
 [package]
 name = "linfa-reduction"
 version = "0.7.0"
-authors = ["Lorenz Schmidt <[email protected]>"]
+authors = [
+    "Lorenz Schmidt <[email protected]>",
+    "Gabriel Bathie <[email protected]>",
+]
 description = "A collection of dimensionality reduction techniques"
 edition = "2018"
 license = "MIT OR Apache-2.0"
@@ -41,6 +44,8 @@ rand = { version = "0.8", features = ["small_rng"] }
 
 linfa = { version = "0.7.0", path = "../.." }
 linfa-kernel = { version = "0.7.0", path = "../linfa-kernel" }
+sprs = "0.11.1"
+rand_xoshiro = "0.6.0"
 
 [dev-dependencies]
 ndarray-npy = { version = "0.8", default-features = false }
@@ -49,3 +54,5 @@ linfa-datasets = { version = "0.7.0", path = "../../datasets", features = [
     "generate",
 ] }
 approx = { version = "0.4" }
+mnist = { version = "0.6.0", features = ["download"] }
+linfa-trees = { version = "0.7.0", path = "../linfa-trees"}
```

algorithms/linfa-reduction/README.md

Lines changed: 4 additions & 0 deletions
````diff
@@ -11,6 +11,8 @@
 `linfa-reduction` currently provides an implementation of the following dimensional reduction methods:
 - Diffusion Mapping
 - Principal Component Analysis (PCA)
+- Gaussian random projections
+- Sparse random projections
 
 ## Examples
 
@@ -19,6 +21,8 @@ There is an usage example in the `examples/` directory. To run, use:
 ```bash
 $ cargo run --release --example diffusion_map
 $ cargo run --release --example pca
+$ cargo run --release --example gaussian_projection
+$ cargo run --release --example sparse_projection
 ```
 
 ## BLAS/LAPACK backend
````
algorithms/linfa-reduction/examples/gaussian_projection.rs

Lines changed: 76 additions & 0 deletions

```rust
use std::{error::Error, time::Instant};

use linfa::prelude::*;
use linfa_reduction::random_projection::GaussianRandomProjection;
use linfa_trees::{DecisionTree, SplitQuality};

use mnist::{MnistBuilder, NormalizedMnist};
use ndarray::{Array1, Array2};
use rand::SeedableRng;
use rand_xoshiro::Xoshiro256Plus;

/// Train a Decision tree on the MNIST data set, with and without dimensionality reduction.
fn main() -> Result<(), Box<dyn Error>> {
    // Parameters
    let train_sz = 10_000usize;
    let test_sz = 1_000usize;
    let reduced_dim = 100;
    let rng = Xoshiro256Plus::seed_from_u64(42);

    let NormalizedMnist {
        trn_img,
        trn_lbl,
        tst_img,
        tst_lbl,
        ..
    } = MnistBuilder::new()
        .label_format_digit()
        .training_set_length(train_sz as u32)
        .test_set_length(test_sz as u32)
        .download_and_extract()
        .finalize()
        .normalize();

    let train_data = Array2::from_shape_vec((train_sz, 28 * 28), trn_img)?;
    let train_labels: Array1<usize> =
        Array1::from_shape_vec(train_sz, trn_lbl)?.map(|x| *x as usize);
    let train_dataset = Dataset::new(train_data, train_labels);

    let test_data = Array2::from_shape_vec((test_sz, 28 * 28), tst_img)?;
    let test_labels: Array1<usize> = Array1::from_shape_vec(test_sz, tst_lbl)?.map(|x| *x as usize);

    let params = DecisionTree::params()
        .split_quality(SplitQuality::Gini)
        .max_depth(Some(10));

    println!("Training non-reduced model...");
    let start = Instant::now();
    let model: DecisionTree<f32, usize> = params.fit(&train_dataset)?;

    let end = start.elapsed();
    let pred_y = model.predict(&test_data);
    let cm = pred_y.confusion_matrix(&test_labels)?;
    println!("Non-reduced model precision: {}%", 100.0 * cm.accuracy());
    println!("Training time: {:.2}s\n", end.as_secs_f32());

    println!("Training reduced model...");
    let start = Instant::now();
    // Compute the random projection and train the model on the reduced dataset.
    let proj = GaussianRandomProjection::<f32>::params_with_rng(rng)
        .target_dim(reduced_dim)
        .fit(&train_dataset)?;
    let reduced_train_ds = proj.transform(&train_dataset);
    let reduced_test_data = proj.transform(&test_data);
    let model_reduced: DecisionTree<f32, usize> = params.fit(&reduced_train_ds)?;

    let end = start.elapsed();
    let pred_reduced = model_reduced.predict(&reduced_test_data);
    let cm_reduced = pred_reduced.confusion_matrix(&test_labels)?;
    println!(
        "Reduced model precision: {}%",
        100.0 * cm_reduced.accuracy()
    );
    println!("Reduction + training time: {:.2}s", end.as_secs_f32());

    Ok(())
}
```
algorithms/linfa-reduction/examples/sparse_projection.rs

Lines changed: 76 additions & 0 deletions

```rust
use std::{error::Error, time::Instant};

use linfa::prelude::*;
use linfa_reduction::random_projection::SparseRandomProjection;
use linfa_trees::{DecisionTree, SplitQuality};

use mnist::{MnistBuilder, NormalizedMnist};
use ndarray::{Array1, Array2};
use rand::SeedableRng;
use rand_xoshiro::Xoshiro256Plus;

/// Train a Decision tree on the MNIST data set, with and without dimensionality reduction.
fn main() -> Result<(), Box<dyn Error>> {
    // Parameters
    let train_sz = 10_000usize;
    let test_sz = 1_000usize;
    let reduced_dim = 100;
    let rng = Xoshiro256Plus::seed_from_u64(42);

    let NormalizedMnist {
        trn_img,
        trn_lbl,
        tst_img,
        tst_lbl,
        ..
    } = MnistBuilder::new()
        .label_format_digit()
        .training_set_length(train_sz as u32)
        .test_set_length(test_sz as u32)
        .download_and_extract()
        .finalize()
        .normalize();

    let train_data = Array2::from_shape_vec((train_sz, 28 * 28), trn_img)?;
    let train_labels: Array1<usize> =
        Array1::from_shape_vec(train_sz, trn_lbl)?.map(|x| *x as usize);
    let train_dataset = Dataset::new(train_data, train_labels);

    let test_data = Array2::from_shape_vec((test_sz, 28 * 28), tst_img)?;
    let test_labels: Array1<usize> = Array1::from_shape_vec(test_sz, tst_lbl)?.map(|x| *x as usize);

    let params = DecisionTree::params()
        .split_quality(SplitQuality::Gini)
        .max_depth(Some(10));

    println!("Training non-reduced model...");
    let start = Instant::now();
    let model: DecisionTree<f32, usize> = params.fit(&train_dataset)?;

    let end = start.elapsed();
    let pred_y = model.predict(&test_data);
    let cm = pred_y.confusion_matrix(&test_labels)?;
    println!("Non-reduced model precision: {}%", 100.0 * cm.accuracy());
    println!("Training time: {:.2}s\n", end.as_secs_f32());

    println!("Training reduced model...");
    let start = Instant::now();
    // Compute the random projection and train the model on the reduced dataset.
    let proj = SparseRandomProjection::<f32>::params_with_rng(rng)
        .target_dim(reduced_dim)
        .fit(&train_dataset)?;
    let reduced_train_ds = proj.transform(&train_dataset);
    let reduced_test_data = proj.transform(&test_data);
    let model_reduced: DecisionTree<f32, usize> = params.fit(&reduced_train_ds)?;

    let end = start.elapsed();
    let pred_reduced = model_reduced.predict(&reduced_test_data);
    let cm_reduced = pred_reduced.confusion_matrix(&test_labels)?;
    println!(
        "Reduced model precision: {}%",
        100.0 * cm_reduced.accuracy()
    );
    println!("Reduction + training time: {:.2}s", end.as_secs_f32());

    Ok(())
}
```

algorithms/linfa-reduction/src/error.rs

Lines changed: 8 additions & 0 deletions
```diff
@@ -18,4 +18,12 @@ pub enum ReductionError {
     LinalgError(#[from] linfa_linalg::LinalgError),
     #[error(transparent)]
     LinfaError(#[from] linfa::error::Error),
+    #[error(transparent)]
+    NdarrayRandError(#[from] ndarray_rand::rand_distr::NormalError),
+    #[error("Precision parameter must be in the interval (0; 1)")]
+    InvalidPrecision,
+    #[error("Target dimension of the projection must be positive")]
+    NonPositiveEmbeddingSize,
+    #[error("Target dimension {0} is larger than the number of features {1}.")]
+    DimensionIncrease(usize, usize),
 }
```
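The three new non-transparent variants mark where hyperparameter validation rejects bad settings. A minimal sketch of checks that would produce them follows; the `validate` helper is hypothetical, since the crate's actual checks live in its hyperparams code, which this commit's visible diff does not show.

```rust
use linfa_reduction::ReductionError;

/// Hypothetical validation helper mirroring the new error variants.
fn validate(eps: Option<f64>, target_dim: Option<usize>) -> Result<(), ReductionError> {
    if let Some(eps) = eps {
        // InvalidPrecision: eps must lie strictly inside (0, 1).
        if eps <= 0.0 || eps >= 1.0 {
            return Err(ReductionError::InvalidPrecision);
        }
    }
    if let Some(dim) = target_dim {
        // NonPositiveEmbeddingSize: the target dimension must be >= 1.
        if dim == 0 {
            return Err(ReductionError::NonPositiveEmbeddingSize);
        }
    }
    Ok(())
}
```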

algorithms/linfa-reduction/src/lib.rs

Lines changed: 1 addition & 0 deletions
```diff
@@ -6,6 +6,7 @@ extern crate ndarray;
 mod diffusion_map;
 mod error;
 mod pca;
+pub mod random_projection;
 pub mod utils;
 
 pub use diffusion_map::{DiffusionMap, DiffusionMapParams, DiffusionMapValidParams};
```
Lines changed: 169 additions & 0 deletions
```rust
use std::marker::PhantomData;

use linfa::{
    dataset::{AsTargets, FromTargetArray},
    prelude::Records,
    traits::{Fit, Transformer},
    DatasetBase, Float,
};
use ndarray::{linalg::Dot, Array2, ArrayBase, Data, Ix2};

use rand::{prelude::Distribution, Rng, SeedableRng};
use rand_xoshiro::Xoshiro256Plus;

use super::hyperparams::RandomProjectionParamsInner;
use super::{common::johnson_lindenstrauss_min_dim, methods::ProjectionMethod};
use super::{RandomProjectionParams, RandomProjectionValidParams};
use crate::ReductionError;

/// Embedding via random projection
pub struct RandomProjection<Proj: ProjectionMethod, F: Float>
where
    Proj::RandomDistribution: Distribution<F>,
{
    projection: Proj::ProjectionMatrix<F>,
}

impl<F, Proj, Rec, T, R> Fit<Rec, T, ReductionError> for RandomProjectionValidParams<Proj, R>
where
    F: Float,
    Proj: ProjectionMethod,
    Rec: Records<Elem = F>,
    R: Rng + Clone,
    Proj::RandomDistribution: Distribution<F>,
{
    type Object = RandomProjection<Proj, F>;

    fn fit(&self, dataset: &linfa::DatasetBase<Rec, T>) -> Result<Self::Object, ReductionError> {
        let n_samples = dataset.nsamples();
        let n_features = dataset.nfeatures();
        let mut rng = self.rng.clone();

        let n_dims = match &self.params {
            RandomProjectionParamsInner::Dimension { target_dim } => *target_dim,
            RandomProjectionParamsInner::Epsilon { eps } => {
                johnson_lindenstrauss_min_dim(n_samples, *eps)
            }
        };

        if n_dims > n_features {
            return Err(ReductionError::DimensionIncrease(n_dims, n_features));
        }

        let projection = Proj::generate_matrix(n_features, n_dims, &mut rng)?;

        Ok(RandomProjection { projection })
    }
}

impl<Proj: ProjectionMethod, F: Float> RandomProjection<Proj, F>
where
    Proj::RandomDistribution: Distribution<F>,
{
    /// Create new parameters for a [`RandomProjection`] with default value
    /// `eps = 0.1` and a [`Xoshiro256Plus`] RNG.
    pub fn params() -> RandomProjectionParams<Proj, Xoshiro256Plus> {
        RandomProjectionParams(RandomProjectionValidParams {
            params: RandomProjectionParamsInner::Epsilon { eps: 0.1 },
            rng: Xoshiro256Plus::seed_from_u64(42),
            marker: PhantomData,
        })
    }

    /// Create new parameters for a [`RandomProjection`] with default values
    /// `eps = 0.1` and the provided [`Rng`].
    pub fn params_with_rng<R>(rng: R) -> RandomProjectionParams<Proj, R>
    where
        R: Rng + Clone,
    {
        RandomProjectionParams(RandomProjectionValidParams {
            params: RandomProjectionParamsInner::Epsilon { eps: 0.1 },
            rng,
            marker: PhantomData,
        })
    }
}

impl<Proj, F, D> Transformer<&ArrayBase<D, Ix2>, Array2<F>> for RandomProjection<Proj, F>
where
    Proj: ProjectionMethod,
    F: Float,
    D: Data<Elem = F>,
    ArrayBase<D, Ix2>: Dot<Proj::ProjectionMatrix<F>, Output = Array2<F>>,
    Proj::RandomDistribution: Distribution<F>,
{
    /// Compute the embedding of a two-dimensional array
    fn transform(&self, x: &ArrayBase<D, Ix2>) -> Array2<F> {
        x.dot(&self.projection)
    }
}

impl<Proj, F, D> Transformer<ArrayBase<D, Ix2>, Array2<F>> for RandomProjection<Proj, F>
where
    Proj: ProjectionMethod,
    F: Float,
    D: Data<Elem = F>,
    ArrayBase<D, Ix2>: Dot<Proj::ProjectionMatrix<F>, Output = Array2<F>>,
    Proj::RandomDistribution: Distribution<F>,
{
    /// Compute the embedding of a two-dimensional array
    fn transform(&self, x: ArrayBase<D, Ix2>) -> Array2<F> {
        self.transform(&x)
    }
}

impl<Proj, F, T> Transformer<DatasetBase<Array2<F>, T>, DatasetBase<Array2<F>, T>>
    for RandomProjection<Proj, F>
where
    Proj: ProjectionMethod,
    F: Float,
    T: AsTargets,
    for<'a> ArrayBase<ndarray::ViewRepr<&'a F>, Ix2>:
        Dot<Proj::ProjectionMatrix<F>, Output = Array2<F>>,
    Proj::RandomDistribution: Distribution<F>,
{
    /// Compute the embedding of a dataset
    ///
    /// # Parameter
    ///
    /// * `data`: a dataset
    ///
    /// # Returns
    ///
    /// New dataset, with data equal to the embedding of the input data
    fn transform(&self, data: DatasetBase<Array2<F>, T>) -> DatasetBase<Array2<F>, T> {
        let new_records = self.transform(data.records().view());

        DatasetBase::new(new_records, data.targets)
    }
}

impl<'a, Proj, F, L, T> Transformer<&'a DatasetBase<Array2<F>, T>, DatasetBase<Array2<F>, T::View>>
    for RandomProjection<Proj, F>
where
    Proj: ProjectionMethod,
    F: Float,
    L: 'a,
    T: AsTargets<Elem = L> + FromTargetArray<'a>,
    for<'b> ArrayBase<ndarray::ViewRepr<&'b F>, Ix2>:
        Dot<Proj::ProjectionMatrix<F>, Output = Array2<F>>,
    Proj::RandomDistribution: Distribution<F>,
{
    /// Compute the embedding of a dataset
    ///
    /// # Parameter
    ///
    /// * `data`: a dataset
    ///
    /// # Returns
    ///
    /// New dataset, with data equal to the embedding of the input data
    fn transform(&self, data: &'a DatasetBase<Array2<F>, T>) -> DatasetBase<Array2<F>, T::View> {
        let new_records = self.transform(data.records().view());

        DatasetBase::new(
            new_records,
            T::new_targets_view(AsTargets::as_targets(data)),
        )
    }
}
```
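A short usage sketch of the two dimension-selection branches in `fit` above: `target_dim` is taken from the examples earlier in this commit, while calling `params()` with no setter goes through the `Epsilon` branch with the default `eps = 0.1`. The `Dataset` type parameters here are assumptions matching the MNIST examples.

```rust
use std::error::Error;

use linfa::{prelude::*, Dataset};
use linfa_reduction::random_projection::GaussianRandomProjection;
use ndarray::Ix1;

fn embed(train: &Dataset<f32, usize, Ix1>) -> Result<(), Box<dyn Error>> {
    // Explicit dimension: `fit` rejects it with `DimensionIncrease`
    // if it exceeds the number of features.
    let proj = GaussianRandomProjection::<f32>::params()
        .target_dim(50)
        .fit(train)?;
    println!("reduced to {} features", proj.transform(train).nfeatures());

    // Precision-driven: the `Epsilon` branch calls
    // `johnson_lindenstrauss_min_dim(n_samples, 0.1)`. For small `eps`
    // this bound can exceed the feature count, in which case `fit`
    // returns `DimensionIncrease` as well.
    let proj = GaussianRandomProjection::<f32>::params().fit(train)?;
    println!("JL bound chose {} features", proj.transform(train).nfeatures());

    Ok(())
}
```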
