Source code for kaggler.feature_selection.feature_selection

import numpy as np
from scipy import sparse
from sklearn import base


class DropInactive(base.BaseEstimator, base.TransformerMixin):
    """Drop features that are active in too few rows.

    A column counts as "active" in a row when its value is strictly
    positive.  Columns with fewer than ``lowest`` active rows are removed.

    Originally written by Baris Umog (https://www.kaggle.com/barisumog).
    """

    def __init__(self, lowest=25):
        # Minimum number of strictly positive entries a column needs in
        # order to be kept.
        self.lowest = lowest

    def fit(self, X, y=None):
        """Learn which columns of ``X`` to keep; ``y`` is not used.

        Stores a boolean array in ``self.mask`` with one entry per column
        (True means the column is kept) and returns ``self``.
        """
        is_positive = (X > 0.0).astype(bool)
        # The column-wise sum may come back as a 2-D row (e.g. for scipy
        # sparse input), so it is flattened to 1-D before thresholding.
        active_counts = np.asarray(is_positive.sum(axis=0)).ravel()
        self.mask = active_counts >= self.lowest
        return self

    def transform(self, X):
        """Return ``X`` restricted to the columns selected by ``fit``.

        Also prints the number of retained columns to stdout.
        """
        print(self.mask.sum())
        return X[:, self.mask]
class DropLowInfo(base.BaseEstimator, base.TransformerMixin):
    """Drop features that carry little information about the target.

    For every column a score is computed as the mean target value
    associated with that column:

    * ``weighted=True``:  ``sum(X[:, j] * y) / sum(X[:, j])``
    * ``weighted=False``: the plain mean of ``y`` over the rows where
      ``X[:, j] > 0``

    A column is kept only if its score lies strictly outside the band
    ``[mean(y) - margin, mean(y) + margin]``.

    Originally written by Baris Umog (https://www.kaggle.com/barisumog).

    Parameters
    ----------
    margin : float, default 0.02
        Half-width of the band around ``mean(y)`` inside which a column
        is considered uninformative.
    weighted : bool, default True
        Whether to weight target values by the feature values (True) or
        to treat every strictly positive entry as weight one (False).
    """

    def __init__(self, margin=0.02, weighted=True):
        self.margin = margin
        self.weighted = weighted

    def fit(self, X, y=None):
        """Compute the column mask from ``X`` and the target ``y``.

        Parameters
        ----------
        X : scipy sparse matrix or 2-D array-like
            Feature matrix; dense input is converted to CSC internally.
        y : 1-D array-like
            Target values, one per row of ``X``.  The ``None`` default
            exists only for signature compatibility; a target is required.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``y`` is None.
        """
        if y is None:
            raise ValueError("DropLowInfo.fit requires a target vector y")

        # Normalise y so lists, pandas Series and (n, 1) columns all work.
        target = np.asarray(y).ravel()
        center = target.mean()
        lower = center - self.margin
        upper = center + self.margin

        # ``multiply`` with column broadcasting is a sparse-matrix method,
        # so coerce X (a no-op conversion for input that is already CSC).
        weights = sparse.csc_matrix(X)
        if not self.weighted:
            # Every strictly positive entry gets weight one.
            weights = weights > 0
        target_col = sparse.csc_matrix(target[:, np.newaxis])

        # Columns whose weights sum to zero produce 0/0 = NaN.  NaN fails
        # both comparisons below, so such columns are dropped; only the
        # floating-point warning is silenced here.
        with np.errstate(divide='ignore', invalid='ignore'):
            score = weights.multiply(target_col).sum(axis=0) / weights.sum(axis=0)
        score = np.asarray(score).ravel().astype('f4')

        self.mask = (score < lower) | (score > upper)
        return self

    def transform(self, X):
        """Return ``X`` restricted to the columns selected by ``fit``.

        Also prints the number of retained columns to stdout.
        """
        print(self.mask.sum())
        return X[:, self.mask]