Source code for neurodsp.utils.outliers
"""Outlier & missing data related utility functions."""
import numpy as np
###################################################################################################
###################################################################################################
[docs]def remove_nans(sig):
"""Drop any NaNs on the edges of an array.
Parameters
----------
sig : 1d or 2d array
Signal to be checked for edge NaNs.
Returns
-------
sig_removed : 1d or 2d array
Signal with NaN edges removed.
sig_nans : 1d array
Boolean array indicating where NaNs were in the original array.
Notes
-----
For 2d arrays, this function assumes the same columns to be NaN across all rows.
"""
sig_nans = np.isnan(sig)
if sig.ndim == 1:
sig_removed = sig[np.where(~sig_nans)]
elif sig.ndim == 2:
sig_removed = sig[~sig_nans].reshape(sig_nans.shape[0], sum(~sig_nans[0, :]))
sig_nans = sig_nans[0, :]
else:
raise ValueError('Only 1d or 2d arrays supported.')
return sig_removed, sig_nans
[docs]def restore_nans(sig, sig_nans, dtype=float):
"""Restore NaN values to the edges of an array.
Parameters
----------
sig : 1d or 2d array
Signal that has had NaN edges removed.
sig_nans : 1d array
Boolean array indicating where NaNs were in the original array.
Returns
-------
sig_restored : 1d or 2d array
Signal with NaN edges restored.
Notes
-----
If sig is 2d, the sig_nans input should reflect the values for a row.
This function assumes the same columns to be NaN across all rows.
"""
if sig.ndim == 1:
sig_restored = np.ones(len(sig_nans), dtype=dtype) * np.nan
sig_restored[~sig_nans] = sig
elif sig.ndim == 2:
sig_restored = np.ones([sig.shape[0], len(sig_nans)], dtype=dtype) * np.nan
sig_restored[:, np.where(sig_nans == False)[0]] = sig
else:
raise ValueError('Only 1d or 2d arrays supported.')
return sig_restored
[docs]def discard_outliers(data, outlier_percent):
"""Discard outlier arrays with high values.
Parameters
----------
data : 2d or 3d array
Array to remove outliers from.
outlier_percent : float
The percentage of outlier values to be removed. Must be between 0 and 100.
Returns
-------
data : array
Array after removing outliers.
Notes
-----
This function drops entries across the last dimension.
Values are dropped based on being an outlier in log10 spacing.
"""
# Get the number of arrays to discard - round up so it doesn't get a zero.
n_discard = int(np.ceil(data.shape[-1] / 100. * outlier_percent))
# Check discard settings compared to data size
if n_discard >= data.shape[-1]:
raise ValueError('Outlier removal would discard all data. Can not proceed.')
# Make 2D -> 3D for looping across array
data = data[np.newaxis, :, :] if data.ndim == 2 else data
# Select the windows to keep from each 2D component of the input data
data = [dat[:, np.argsort(np.mean(np.log10(dat), axis=0))[:-n_discard]] for dat in data]
# Reshape array and squeeze to drop back to 2D if that was original shape
data = np.squeeze(np.stack(data))
# Ensure output maintains the correct shape, keeping 2D if ends up as 1D
if data.ndim == 1:
data = data[:, np.newaxis]
return data