Commit 2eaa686
Feat: Implement Random Projections (#332)
* Add random projection algorithms for dimensionality reduction. Contains two algorithms based on variants of the Johnson-Lindenstrauss lemma:
  - Random projections with Gaussian coefficients
  - Sparse random projections with +/-1 coefficients (multiplied by a scaling factor)
* Update readme
* Add RNG to random projection structs. The RNG defaults to Xoshiro256Plus if not provided by the user. Also add tests for the minimum dimension, using values from scikit-learn.
* Check that random projections actually reduce the dimension of the data.
* Use a fixed dimension in error tests
* Refactor random projections code
1 parent a29abe8 commit 2eaa686
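For readers unfamiliar with the lemma: both schemes named above reduce dimension by multiplying the data with a random matrix whose shape is chosen so pairwise distances are approximately preserved. Below is a minimal, self-contained sketch of the standard Johnson-Lindenstrauss lower bound (the same formula scikit-learn uses) and one common sparse +/-1 coefficient scheme. The function names, density, and scaling here are illustrative assumptions, not this crate's API.

```rust
use rand::Rng;

/// Minimum embedding dimension `k` for `n_samples` points so that all
/// pairwise distances are preserved within (1 ± eps), per the standard
/// Johnson-Lindenstrauss bound: k >= 4 ln(n) / (eps^2/2 - eps^3/3).
fn jl_min_dim(n_samples: usize, eps: f64) -> usize {
    let denom = eps.powi(2) / 2.0 - eps.powi(3) / 3.0;
    (4.0 * (n_samples as f64).ln() / denom).ceil() as usize
}

/// One entry of a sparse projection matrix in the Achlioptas/Li style:
/// zero with probability 1 - density, otherwise ±sqrt(1 / (density * k))
/// with equal probability. The density and scaling linfa actually uses
/// may differ.
fn sparse_coefficient<R: Rng>(rng: &mut R, density: f64, k: usize) -> f64 {
    if rng.gen::<f64>() < density {
        let sign = if rng.gen::<bool>() { 1.0 } else { -1.0 };
        sign / (density * k as f64).sqrt()
    } else {
        0.0
    }
}

fn main() {
    // 10_000 samples at eps = 0.1, matching the MNIST examples below.
    let k = jl_min_dim(10_000, 0.1);
    println!("JL minimum dimension: {k}");
    let mut rng = rand::thread_rng();
    println!("sample coefficient: {}", sparse_coefficient(&mut rng, 0.1, k));
}
```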

11 files changed: +726 additions, −1 deletion

algorithms/linfa-reduction/Cargo.toml

Lines changed: 8 additions & 1 deletion
```diff
@@ -1,7 +1,10 @@
 [package]
 name = "linfa-reduction"
 version = "0.7.0"
-authors = ["Lorenz Schmidt <[email protected]>"]
+authors = [
+    "Lorenz Schmidt <[email protected]>",
+    "Gabriel Bathie <[email protected]>",
+]
 description = "A collection of dimensionality reduction techniques"
 edition = "2018"
 license = "MIT OR Apache-2.0"
@@ -41,6 +44,8 @@ rand = { version = "0.8", features = ["small_rng"] }
 
 linfa = { version = "0.7.0", path = "../.." }
 linfa-kernel = { version = "0.7.0", path = "../linfa-kernel" }
+sprs = "0.11.1"
+rand_xoshiro = "0.6.0"
 
 [dev-dependencies]
 ndarray-npy = { version = "0.8", default-features = false }
@@ -49,3 +54,5 @@ linfa-datasets = { version = "0.7.0", path = "../../datasets", features = [
     "generate",
 ] }
 approx = { version = "0.4" }
+mnist = { version = "0.6.0", features = ["download"] }
+linfa-trees = { version = "0.7.0", path = "../linfa-trees"}
```

algorithms/linfa-reduction/README.md

Lines changed: 4 additions & 0 deletions
````diff
@@ -11,6 +11,8 @@
 `linfa-reduction` currently provides an implementation of the following dimensional reduction methods:
 - Diffusion Mapping
 - Principal Component Analysis (PCA)
+- Gaussian random projections
+- Sparse random projections
 
 ## Examples
 
@@ -19,6 +21,8 @@ There is an usage example in the `examples/` directory. To run, use:
 ```bash
 $ cargo run --release --example diffusion_map
 $ cargo run --release --example pca
+$ cargo run --release --example gaussian_projection
+$ cargo run --release --example sparse_projection
 ```
 
 ## BLAS/LAPACK backend
````
algorithms/linfa-reduction/examples/gaussian_projection.rs

Lines changed: 76 additions & 0 deletions

```rust
use std::{error::Error, time::Instant};

use linfa::prelude::*;
use linfa_reduction::random_projection::GaussianRandomProjection;
use linfa_trees::{DecisionTree, SplitQuality};

use mnist::{MnistBuilder, NormalizedMnist};
use ndarray::{Array1, Array2};
use rand::SeedableRng;
use rand_xoshiro::Xoshiro256Plus;

/// Train a Decision tree on the MNIST data set, with and without dimensionality reduction.
fn main() -> Result<(), Box<dyn Error>> {
    // Parameters
    let train_sz = 10_000usize;
    let test_sz = 1_000usize;
    let reduced_dim = 100;
    let rng = Xoshiro256Plus::seed_from_u64(42);

    let NormalizedMnist {
        trn_img,
        trn_lbl,
        tst_img,
        tst_lbl,
        ..
    } = MnistBuilder::new()
        .label_format_digit()
        .training_set_length(train_sz as u32)
        .test_set_length(test_sz as u32)
        .download_and_extract()
        .finalize()
        .normalize();

    let train_data = Array2::from_shape_vec((train_sz, 28 * 28), trn_img)?;
    let train_labels: Array1<usize> =
        Array1::from_shape_vec(train_sz, trn_lbl)?.map(|x| *x as usize);
    let train_dataset = Dataset::new(train_data, train_labels);

    let test_data = Array2::from_shape_vec((test_sz, 28 * 28), tst_img)?;
    let test_labels: Array1<usize> = Array1::from_shape_vec(test_sz, tst_lbl)?.map(|x| *x as usize);

    let params = DecisionTree::params()
        .split_quality(SplitQuality::Gini)
        .max_depth(Some(10));

    println!("Training non-reduced model...");
    let start = Instant::now();
    let model: DecisionTree<f32, usize> = params.fit(&train_dataset)?;

    let end = start.elapsed();
    let pred_y = model.predict(&test_data);
    let cm = pred_y.confusion_matrix(&test_labels)?;
    println!("Non-reduced model precision: {}%", 100.0 * cm.accuracy());
    println!("Training time: {:.2}s\n", end.as_secs_f32());

    println!("Training reduced model...");
    let start = Instant::now();
    // Compute the random projection and train the model on the reduced dataset.
    let proj = GaussianRandomProjection::<f32>::params_with_rng(rng)
        .target_dim(reduced_dim)
        .fit(&train_dataset)?;
    let reduced_train_ds = proj.transform(&train_dataset);
    let reduced_test_data = proj.transform(&test_data);
    let model_reduced: DecisionTree<f32, usize> = params.fit(&reduced_train_ds)?;

    let end = start.elapsed();
    let pred_reduced = model_reduced.predict(&reduced_test_data);
    let cm_reduced = pred_reduced.confusion_matrix(&test_labels)?;
    println!(
        "Reduced model precision: {}%",
        100.0 * cm_reduced.accuracy()
    );
    println!("Reduction + training time: {:.2}s", end.as_secs_f32());

    Ok(())
}
```
algorithms/linfa-reduction/examples/sparse_projection.rs

Lines changed: 76 additions & 0 deletions

```rust
use std::{error::Error, time::Instant};

use linfa::prelude::*;
use linfa_reduction::random_projection::SparseRandomProjection;
use linfa_trees::{DecisionTree, SplitQuality};

use mnist::{MnistBuilder, NormalizedMnist};
use ndarray::{Array1, Array2};
use rand::SeedableRng;
use rand_xoshiro::Xoshiro256Plus;

/// Train a Decision tree on the MNIST data set, with and without dimensionality reduction.
fn main() -> Result<(), Box<dyn Error>> {
    // Parameters
    let train_sz = 10_000usize;
    let test_sz = 1_000usize;
    let reduced_dim = 100;
    let rng = Xoshiro256Plus::seed_from_u64(42);

    let NormalizedMnist {
        trn_img,
        trn_lbl,
        tst_img,
        tst_lbl,
        ..
    } = MnistBuilder::new()
        .label_format_digit()
        .training_set_length(train_sz as u32)
        .test_set_length(test_sz as u32)
        .download_and_extract()
        .finalize()
        .normalize();

    let train_data = Array2::from_shape_vec((train_sz, 28 * 28), trn_img)?;
    let train_labels: Array1<usize> =
        Array1::from_shape_vec(train_sz, trn_lbl)?.map(|x| *x as usize);
    let train_dataset = Dataset::new(train_data, train_labels);

    let test_data = Array2::from_shape_vec((test_sz, 28 * 28), tst_img)?;
    let test_labels: Array1<usize> = Array1::from_shape_vec(test_sz, tst_lbl)?.map(|x| *x as usize);

    let params = DecisionTree::params()
        .split_quality(SplitQuality::Gini)
        .max_depth(Some(10));

    println!("Training non-reduced model...");
    let start = Instant::now();
    let model: DecisionTree<f32, usize> = params.fit(&train_dataset)?;

    let end = start.elapsed();
    let pred_y = model.predict(&test_data);
    let cm = pred_y.confusion_matrix(&test_labels)?;
    println!("Non-reduced model precision: {}%", 100.0 * cm.accuracy());
    println!("Training time: {:.2}s\n", end.as_secs_f32());

    println!("Training reduced model...");
    let start = Instant::now();
    // Compute the random projection and train the model on the reduced dataset.
    let proj = SparseRandomProjection::<f32>::params_with_rng(rng)
        .target_dim(reduced_dim)
        .fit(&train_dataset)?;
    let reduced_train_ds = proj.transform(&train_dataset);
    let reduced_test_data = proj.transform(&test_data);
    let model_reduced: DecisionTree<f32, usize> = params.fit(&reduced_train_ds)?;

    let end = start.elapsed();
    let pred_reduced = model_reduced.predict(&reduced_test_data);
    let cm_reduced = pred_reduced.confusion_matrix(&test_labels)?;
    println!(
        "Reduced model precision: {}%",
        100.0 * cm_reduced.accuracy()
    );
    println!("Reduction + training time: {:.2}s", end.as_secs_f32());

    Ok(())
}
```

algorithms/linfa-reduction/src/error.rs

Lines changed: 8 additions & 0 deletions
```diff
@@ -18,4 +18,12 @@ pub enum ReductionError {
     LinalgError(#[from] linfa_linalg::LinalgError),
     #[error(transparent)]
     LinfaError(#[from] linfa::error::Error),
+    #[error(transparent)]
+    NdarrayRandError(#[from] ndarray_rand::rand_distr::NormalError),
+    #[error("Precision parameter must be in the interval (0; 1)")]
+    InvalidPrecision,
+    #[error("Target dimension of the projection must be positive")]
+    NonPositiveEmbeddingSize,
+    #[error("Target dimension {0} is larger than the number of features {1}.")]
+    DimensionIncrease(usize, usize),
 }
```
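The three new non-transparent variants mark where hyperparameter validation rejects bad settings. A minimal sketch of checks that would produce them follows; the `validate` helper is hypothetical, since the crate's actual checks live in its hyperparams code, which this commit's visible diff does not show.

```rust
use linfa_reduction::ReductionError;

/// Hypothetical validation helper mirroring the new error variants.
fn validate(eps: Option<f64>, target_dim: Option<usize>) -> Result<(), ReductionError> {
    if let Some(eps) = eps {
        // InvalidPrecision: eps must lie strictly inside (0, 1).
        if eps <= 0.0 || eps >= 1.0 {
            return Err(ReductionError::InvalidPrecision);
        }
    }
    if let Some(dim) = target_dim {
        // NonPositiveEmbeddingSize: the target dimension must be >= 1.
        if dim == 0 {
            return Err(ReductionError::NonPositiveEmbeddingSize);
        }
    }
    Ok(())
}
```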

algorithms/linfa-reduction/src/lib.rs

Lines changed: 1 addition & 0 deletions
```diff
@@ -6,6 +6,7 @@ extern crate ndarray;
 mod diffusion_map;
 mod error;
 mod pca;
+pub mod random_projection;
 pub mod utils;
 
 pub use diffusion_map::{DiffusionMap, DiffusionMapParams, DiffusionMapValidParams};
```
Lines changed: 169 additions & 0 deletions
```rust
use std::marker::PhantomData;

use linfa::{
    dataset::{AsTargets, FromTargetArray},
    prelude::Records,
    traits::{Fit, Transformer},
    DatasetBase, Float,
};
use ndarray::{linalg::Dot, Array2, ArrayBase, Data, Ix2};

use rand::{prelude::Distribution, Rng, SeedableRng};
use rand_xoshiro::Xoshiro256Plus;

use super::hyperparams::RandomProjectionParamsInner;
use super::{common::johnson_lindenstrauss_min_dim, methods::ProjectionMethod};
use super::{RandomProjectionParams, RandomProjectionValidParams};
use crate::ReductionError;

/// Embedding via random projection
pub struct RandomProjection<Proj: ProjectionMethod, F: Float>
where
    Proj::RandomDistribution: Distribution<F>,
{
    projection: Proj::ProjectionMatrix<F>,
}

impl<F, Proj, Rec, T, R> Fit<Rec, T, ReductionError> for RandomProjectionValidParams<Proj, R>
where
    F: Float,
    Proj: ProjectionMethod,
    Rec: Records<Elem = F>,
    R: Rng + Clone,
    Proj::RandomDistribution: Distribution<F>,
{
    type Object = RandomProjection<Proj, F>;

    fn fit(&self, dataset: &linfa::DatasetBase<Rec, T>) -> Result<Self::Object, ReductionError> {
        let n_samples = dataset.nsamples();
        let n_features = dataset.nfeatures();
        let mut rng = self.rng.clone();

        let n_dims = match &self.params {
            RandomProjectionParamsInner::Dimension { target_dim } => *target_dim,
            RandomProjectionParamsInner::Epsilon { eps } => {
                johnson_lindenstrauss_min_dim(n_samples, *eps)
            }
        };

        if n_dims > n_features {
            return Err(ReductionError::DimensionIncrease(n_dims, n_features));
        }

        let projection = Proj::generate_matrix(n_features, n_dims, &mut rng)?;

        Ok(RandomProjection { projection })
    }
}

impl<Proj: ProjectionMethod, F: Float> RandomProjection<Proj, F>
where
    Proj::RandomDistribution: Distribution<F>,
{
    /// Create new parameters for a [`RandomProjection`] with default value
    /// `eps = 0.1` and a [`Xoshiro256Plus`] RNG.
    pub fn params() -> RandomProjectionParams<Proj, Xoshiro256Plus> {
        RandomProjectionParams(RandomProjectionValidParams {
            params: RandomProjectionParamsInner::Epsilon { eps: 0.1 },
            rng: Xoshiro256Plus::seed_from_u64(42),
            marker: PhantomData,
        })
    }

    /// Create new parameters for a [`RandomProjection`] with default values
    /// `eps = 0.1` and the provided [`Rng`].
    pub fn params_with_rng<R>(rng: R) -> RandomProjectionParams<Proj, R>
    where
        R: Rng + Clone,
    {
        RandomProjectionParams(RandomProjectionValidParams {
            params: RandomProjectionParamsInner::Epsilon { eps: 0.1 },
            rng,
            marker: PhantomData,
        })
    }
}

impl<Proj, F, D> Transformer<&ArrayBase<D, Ix2>, Array2<F>> for RandomProjection<Proj, F>
where
    Proj: ProjectionMethod,
    F: Float,
    D: Data<Elem = F>,
    ArrayBase<D, Ix2>: Dot<Proj::ProjectionMatrix<F>, Output = Array2<F>>,
    Proj::RandomDistribution: Distribution<F>,
{
    /// Compute the embedding of a two-dimensional array
    fn transform(&self, x: &ArrayBase<D, Ix2>) -> Array2<F> {
        x.dot(&self.projection)
    }
}

impl<Proj, F, D> Transformer<ArrayBase<D, Ix2>, Array2<F>> for RandomProjection<Proj, F>
where
    Proj: ProjectionMethod,
    F: Float,
    D: Data<Elem = F>,
    ArrayBase<D, Ix2>: Dot<Proj::ProjectionMatrix<F>, Output = Array2<F>>,
    Proj::RandomDistribution: Distribution<F>,
{
    /// Compute the embedding of a two-dimensional array
    fn transform(&self, x: ArrayBase<D, Ix2>) -> Array2<F> {
        self.transform(&x)
    }
}

impl<Proj, F, T> Transformer<DatasetBase<Array2<F>, T>, DatasetBase<Array2<F>, T>>
    for RandomProjection<Proj, F>
where
    Proj: ProjectionMethod,
    F: Float,
    T: AsTargets,
    for<'a> ArrayBase<ndarray::ViewRepr<&'a F>, Ix2>:
        Dot<Proj::ProjectionMatrix<F>, Output = Array2<F>>,
    Proj::RandomDistribution: Distribution<F>,
{
    /// Compute the embedding of a dataset
    ///
    /// # Parameter
    ///
    /// * `data`: a dataset
    ///
    /// # Returns
    ///
    /// New dataset, with data equal to the embedding of the input data
    fn transform(&self, data: DatasetBase<Array2<F>, T>) -> DatasetBase<Array2<F>, T> {
        let new_records = self.transform(data.records().view());

        DatasetBase::new(new_records, data.targets)
    }
}

impl<'a, Proj, F, L, T> Transformer<&'a DatasetBase<Array2<F>, T>, DatasetBase<Array2<F>, T::View>>
    for RandomProjection<Proj, F>
where
    Proj: ProjectionMethod,
    F: Float,
    L: 'a,
    T: AsTargets<Elem = L> + FromTargetArray<'a>,
    for<'b> ArrayBase<ndarray::ViewRepr<&'b F>, Ix2>:
        Dot<Proj::ProjectionMatrix<F>, Output = Array2<F>>,
    Proj::RandomDistribution: Distribution<F>,
{
    /// Compute the embedding of a dataset
    ///
    /// # Parameter
    ///
    /// * `data`: a dataset
    ///
    /// # Returns
    ///
    /// New dataset, with data equal to the embedding of the input data
    fn transform(&self, data: &'a DatasetBase<Array2<F>, T>) -> DatasetBase<Array2<F>, T::View> {
        let new_records = self.transform(data.records().view());

        DatasetBase::new(
            new_records,
            T::new_targets_view(AsTargets::as_targets(data)),
        )
    }
}
```
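A short usage sketch of the two dimension-selection branches in `fit` above: `target_dim` is taken from the examples earlier in this commit, while calling `params()` with no setter goes through the `Epsilon` branch with the default `eps = 0.1`. The `Dataset` type parameters here are assumptions matching the MNIST examples.

```rust
use std::error::Error;

use linfa::{prelude::*, Dataset};
use linfa_reduction::random_projection::GaussianRandomProjection;
use ndarray::Ix1;

fn embed(train: &Dataset<f32, usize, Ix1>) -> Result<(), Box<dyn Error>> {
    // Explicit dimension: `fit` rejects it with `DimensionIncrease`
    // if it exceeds the number of features.
    let proj = GaussianRandomProjection::<f32>::params()
        .target_dim(50)
        .fit(train)?;
    println!("reduced to {} features", proj.transform(train).nfeatures());

    // Precision-driven: the `Epsilon` branch calls
    // `johnson_lindenstrauss_min_dim(n_samples, 0.1)`. For small `eps`
    // this bound can exceed the feature count, in which case `fit`
    // returns `DimensionIncrease` as well.
    let proj = GaussianRandomProjection::<f32>::params().fit(train)?;
    println!("JL bound chose {} features", proj.transform(train).nfeatures());

    Ok(())
}
```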
