Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
232 changes: 232 additions & 0 deletions src/model_selection/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ extern crate rand;
use crate::linalg::BaseVector;
use crate::linalg::Matrix;
use crate::math::num::RealNumber;
use rand::seq::SliceRandom;
use rand::thread_rng;
use rand::Rng;

/// Splits data into 2 disjoint datasets.
Expand Down Expand Up @@ -81,6 +83,113 @@ pub fn train_test_split<T: RealNumber, M: Matrix<T>>(
(x_train, x_test, y_train, y_test)
}

///
/// KFold Cross-Validation
///
pub trait BaseKFold {
/// Returns integer indices corresponding to test sets
fn test_indices<T: RealNumber, M: Matrix<T>>(&self, x: &M) -> Vec<Vec<usize>>;

/// Returns masksk corresponding to test sets
fn test_masks<T: RealNumber, M: Matrix<T>>(&self, x: &M) -> Vec<Vec<bool>>;

/// Return a tuple containing the the training set indices for that split and
/// the testing set indices for that split.
fn split<T: RealNumber, M: Matrix<T>>(&self, x: &M) -> Vec<(Vec<usize>, Vec<usize>)>;
}

///
/// An implementation of KFold
///
pub struct KFold {
n_splits: usize, // cannot exceed std::usize::MAX
shuffle: bool,
// TODO: to be implemented later
// random_state: i32,
}

impl Default for KFold {
fn default() -> KFold {
KFold {
n_splits: 3 as usize,
shuffle: true,
}
}
}

///
/// Abstract class for all KFold functionalities
///
impl BaseKFold for KFold {
fn test_indices<T: RealNumber, M: Matrix<T>>(&self, x: &M) -> Vec<Vec<usize>> {
// number of samples (rows) in the matrix
let n_samples: usize = x.shape().0;

// initialise indices
let mut indices: Vec<usize> = (0..n_samples).collect();
if self.shuffle == true {
indices.shuffle(&mut thread_rng());
}
// return a new array of given shape n_split, filled with each element of n_samples divided by n_splits.
let mut fold_sizes = vec![n_samples / self.n_splits; self.n_splits];

// increment by one if odd
for i in 0..(n_samples % self.n_splits) {
fold_sizes[i] = fold_sizes[i] + 1;
}

// generate the right array of arrays for test indices
let mut return_values: Vec<Vec<usize>> = Vec::with_capacity(self.n_splits);
let mut current: usize = 0;
for fold_size in fold_sizes.drain(..) {
let stop = current + fold_size;
return_values.push(indices[current..stop].to_vec());
current = stop
}

return_values
}

fn test_masks<T: RealNumber, M: Matrix<T>>(&self, x: &M) -> Vec<Vec<bool>> {
let mut return_values: Vec<Vec<bool>> = Vec::with_capacity(self.n_splits);
for test_index in self.test_indices(x).drain(..) {
// init mask
let mut test_mask = vec![false; x.shape().0];
// set mask's indices to true according to test indices
for i in test_index {
test_mask[i] = true; // can be implemented with map()
}
return_values.push(test_mask);
}
return_values
}

fn split<T: RealNumber, M: Matrix<T>>(&self, x: &M) -> Vec<(Vec<usize>, Vec<usize>)> {
let n_samples: usize = x.shape().0;
let indices: Vec<usize> = (0..n_samples).collect();

let mut return_values: Vec<(Vec<usize>, Vec<usize>)> = Vec::with_capacity(self.n_splits); // TODO: init nested vecs with capacities by getting the length of test_index vecs

for test_index in self.test_masks(x).drain(..) {
let train_index = indices
.clone()
.iter()
.enumerate()
.filter(|&(idx, _)| test_index[idx] == false)
.map(|(idx, _)| idx)
.collect::<Vec<usize>>(); // filter train indices out according to mask
let test_index = indices
.iter()
.enumerate()
.filter(|&(idx, _)| test_index[idx] == true)
.map(|(idx, _)| idx)
.collect::<Vec<usize>>(); // filter tests indices out according to mask
return_values.push((train_index, test_index))
}
return_values
}
}

#[cfg(test)]
mod tests {

Expand All @@ -106,4 +215,127 @@ mod tests {
assert_eq!(x_train.shape().0, y_train.len());
assert_eq!(x_test.shape().0, y_test.len());
}

#[test]
fn run_kfold_return_test_indices_simple() {
let k = KFold {
n_splits: 3,
shuffle: false,
};
let x: DenseMatrix<f64> = DenseMatrix::rand(33, 100);
let test_indices = k.test_indices(&x);

assert_eq!(test_indices[0], (0..11).collect::<Vec<usize>>());
assert_eq!(test_indices[1], (11..22).collect::<Vec<usize>>());
assert_eq!(test_indices[2], (22..33).collect::<Vec<usize>>());
}

#[test]
fn run_kfold_return_test_indices_odd() {
let k = KFold {
n_splits: 3,
shuffle: false,
};
let x: DenseMatrix<f64> = DenseMatrix::rand(34, 100);
let test_indices = k.test_indices(&x);

assert_eq!(test_indices[0], (0..12).collect::<Vec<usize>>());
assert_eq!(test_indices[1], (12..23).collect::<Vec<usize>>());
assert_eq!(test_indices[2], (23..34).collect::<Vec<usize>>());
}

#[test]
fn run_kfold_return_test_mask_simple() {
let k = KFold {
n_splits: 2,
shuffle: false,
};
let x: DenseMatrix<f64> = DenseMatrix::rand(22, 100);
let test_masks = k.test_masks(&x);

for t in &test_masks[0][0..11] {
// TODO: this can be prob done better
assert_eq!(*t, true)
}
for t in &test_masks[0][11..22] {
assert_eq!(*t, false)
}

for t in &test_masks[1][0..11] {
assert_eq!(*t, false)
}
for t in &test_masks[1][11..22] {
assert_eq!(*t, true)
}
}

#[test]
fn run_kfold_return_split_simple() {
let k = KFold {
n_splits: 2,
shuffle: false,
};
let x: DenseMatrix<f64> = DenseMatrix::rand(22, 100);
let train_test_splits = k.split(&x);

assert_eq!(train_test_splits[0].1, (0..11).collect::<Vec<usize>>());
assert_eq!(train_test_splits[0].0, (11..22).collect::<Vec<usize>>());
assert_eq!(train_test_splits[1].0, (0..11).collect::<Vec<usize>>());
assert_eq!(train_test_splits[1].1, (11..22).collect::<Vec<usize>>());
}

#[test]
fn run_kfold_return_split_simple_shuffle() {
let k = KFold {
n_splits: 2,
..KFold::default()
};
let x: DenseMatrix<f64> = DenseMatrix::rand(23, 100);
let train_test_splits = k.split(&x);

assert_eq!(train_test_splits[0].1.len(), 12 as usize);
assert_eq!(train_test_splits[0].0.len(), 11 as usize);
assert_eq!(train_test_splits[1].0.len(), 12 as usize);
assert_eq!(train_test_splits[1].1.len(), 11 as usize);
}

#[test]
fn numpy_parity_test() {
let k = KFold {
n_splits: 3,
shuffle: false,
};
let x: DenseMatrix<f64> = DenseMatrix::rand(10, 4);
let expected: Vec<(Vec<usize>, Vec<usize>)> = vec![
(vec![4, 5, 6, 7, 8, 9], vec![0, 1, 2, 3]),
(vec![0, 1, 2, 3, 7, 8, 9], vec![4, 5, 6]),
(vec![0, 1, 2, 3, 4, 5, 6], vec![7, 8, 9]),
];
for ((train, test), (expected_train, expected_test)) in
k.split(&x).into_iter().zip(expected)
{
assert_eq!(test, expected_test);
assert_eq!(train, expected_train);
}
}

#[test]
fn numpy_parity_test_shuffle() {
let k = KFold {
n_splits: 3,
..KFold::default()
};
let x: DenseMatrix<f64> = DenseMatrix::rand(10, 4);
let expected: Vec<(Vec<usize>, Vec<usize>)> = vec![
(vec![4, 5, 6, 7, 8, 9], vec![0, 1, 2, 3]),
(vec![0, 1, 2, 3, 7, 8, 9], vec![4, 5, 6]),
(vec![0, 1, 2, 3, 4, 5, 6], vec![7, 8, 9]),
];
for ((train, test), (expected_train, expected_test)) in
k.split(&x).into_iter().zip(expected)
{
assert_eq!(test.len(), expected_test.len());
assert_eq!(train.len(), expected_train.len());
}
}
}