import csv
import datetime
import h5py
import heapq
from io import open
import json
from logging import getLogger
import numpy as np
import os
import pickle
from sklearn.datasets import load_svmlight_file, dump_svmlight_file
from scipy import sparse
import time
# Module-level logger named after this module; the helpers below log through it.
logger = getLogger(__name__)
def is_number(s):
    """Check if a string is a number or not.

    Args:
        s (str): Value to test. Anything accepted by ``float()`` counts as
            a number.

    Returns:
        True if ``float(s)`` succeeds, False otherwise.
    """
    try:
        float(s)
    except (TypeError, ValueError):
        # ValueError: non-numeric text; TypeError: non-string input such as
        # None, which float() rejects outright.
        return False
    return True
def save_data(X, y, path):
    """Write a data matrix and target vector to disk, picking the format
    from the extension of ``path``.

    Supported extensions are ``.csv`` (CSV), ``.sps`` (LibSVM) and ``.h5``
    (HDF5).

    Args:
        X (numpy or scipy sparse matrix): Data matrix
        y (numpy array): Target vector. When None, a vector of zeros with
            one entry per row of X is written instead.
        path (str): Destination file; its extension selects the writer.
    """
    writers = {'.csv': save_csv, '.sps': save_libsvm, '.h5': save_hdf5}
    _, ext = os.path.splitext(path)
    # Resolve the writer first so an unsupported extension fails before
    # any work is done.
    writer = writers[ext]

    target = np.zeros((X.shape[0], )) if y is None else y
    writer(X, target, path)
def save_csv(X, y, path):
    """Write data to a comma-separated file.

    The target is written as the first column, followed by the columns
    of X.

    Args:
        X (numpy or scipy sparse matrix): Data matrix. Sparse input is
            converted to dense before writing.
        y (numpy array): Target vector.
        path (str): Path to the CSV file to save data.
    """
    features = X.todense() if sparse.issparse(X) else X
    target_column = y.reshape((-1, 1))
    table = np.hstack((target_column, features))
    np.savetxt(path, table, delimiter=',')
def save_libsvm(X, y, path):
    """Write data to a LibSVM (svmlight) format file.

    Column indices are written one-based (``zero_based=False``).

    Args:
        X (numpy or scipy sparse matrix): Data matrix
        y (numpy array): Target vector.
        path (str): Path to the LibSVM file to save data.
    """
    dump_svmlight_file(X, y, path, zero_based=False)
def save_hdf5(X, y, path):
    """Write data to an HDF5 file.

    The file always holds ``issparse`` (1 or 0) and ``target``. Dense
    input is stored under ``data``; sparse input is converted to CSR if
    needed and stored as ``shape``, ``data``, ``indices`` and ``indptr``.

    Args:
        X (numpy or scipy sparse matrix): Data matrix
        y (numpy array): Target vector.
        path (str): Path to the HDF5 file to save data.
    """
    with h5py.File(path, 'w') as f:
        sparse_input = sparse.issparse(X)
        f['issparse'] = 1 if sparse_input else 0
        f['target'] = y

        if not sparse_input:
            f['data'] = X
            return

        # Only the CSR component arrays are stored, so normalize the format.
        csr = X if sparse.isspmatrix_csr(X) else X.tocsr()
        f['shape'] = np.array(csr.shape)
        f['data'] = csr.data
        f['indices'] = csr.indices
        f['indptr'] = csr.indptr
def load_data(path, dense=False):
    """Read a data matrix and target vector, picking the format from the
    extension of ``path``.

    Supported extensions are ``.csv`` (CSV), ``.sps`` (LibSVM) and ``.h5``
    (HDF5).

    Args:
        path (str): A path to the CSV, LibSVM or HDF5 format file.
        dense (boolean): When true, a sparse result is converted to a
            dense matrix before being returned. Defaults to False.

    Returns:
        Data matrix X and target vector y
    """
    readers = {'.csv': load_csv, '.sps': load_svmlight_file, '.h5': load_hdf5}
    _, ext = os.path.splitext(path)
    reader = readers[ext]

    X, y = reader(path)
    if dense and sparse.issparse(X):
        X = X.todense()

    return X, y
def load_csv(path):
    """Load data from a CSV file.

    The first column is taken as the target and the remaining columns as
    the data matrix. If the first field of the first line is not a number,
    that line is treated as a header and skipped.

    Args:
        path (str): A path to the CSV format file containing data.

    Returns:
        Data matrix X and target vector y
    """
    with open(path) as f:
        first_line = f.readline().strip()

    has_header = not is_number(first_line.split(',')[0])

    # ndmin=2 keeps the result two-dimensional even when the file has a
    # single data row; otherwise loadtxt returns a 1-D array and the column
    # slicing below raises IndexError.
    table = np.loadtxt(path, delimiter=',',
                       skiprows=1 if has_header else 0, ndmin=2)

    y = np.array(table[:, 0]).flatten()
    X = table[:, 1:]

    return X, y
def load_hdf5(path):
    """Load data from an HDF5 file.

    Reads the ``issparse`` flag to decide the layout: when it is set, a
    CSR matrix is rebuilt from ``shape``, ``data``, ``indices`` and
    ``indptr``; otherwise ``data`` is returned as stored. The target is
    read from ``target``.

    Args:
        path (str): A path to the HDF5 format file containing data.

    Returns:
        Data matrix X and target vector y
    """
    with h5py.File(path, 'r') as f:
        if f['issparse'][...]:
            dims = tuple(f['shape'][...])
            values = f['data'][...]
            col_indices = f['indices'][...]
            row_pointers = f['indptr'][...]
            X = sparse.csr_matrix((values, col_indices, row_pointers),
                                  shape=dims)
        else:
            X = f['data'][...]

        y = f['target'][...]

    return X, y
def read_sps(path):
    """Read a LibSVM file line-by-line.

    Each line is split on single spaces; the first token is parsed as the
    integer target and the remaining tokens are returned unparsed. Blank
    lines are skipped.

    Args:
        path (str): A path to the LibSVM file to read.

    Yields:
        data (list) and target (int).
    """
    # The context manager closes the file when the generator is exhausted
    # or closed, instead of leaving the handle to the garbage collector.
    with open(path) as f:
        for line in f:
            line = line.rstrip()
            if not line:
                # A blank (e.g. trailing) line carries no sample.
                continue

            # parse x
            xs = line.split(' ')
            yield xs[1:], int(xs[0])
def shuf_file(f, shuf_win):
    """Yield the items of ``f`` reordered within a bounded window.

    Items are keyed by ``hash(item)`` and buffered in a min-heap of at
    most ``shuf_win`` entries. Once the buffer is full, each new item
    pushes out the entry with the smallest key; the remaining entries are
    drained in key order at the end.

    Args:
        f (iterable): Source of items, e.g. an open file yielding lines.
        shuf_win (int): Maximum number of items buffered at once.

    Yields:
        The items of ``f``, each exactly once.
    """
    window = []
    for line in f:
        entry = (hash(line), line)
        if len(window) >= shuf_win:
            # Buffer full: add the new entry and emit the smallest one.
            yield heapq.heappushpop(window, entry)[1]
        else:
            heapq.heappush(window, entry)

    # Flush whatever is still buffered.
    while window:
        yield heapq.heappop(window)[1]
class PathJoiner:
    """Load directory names from SETTINGS.json.

    Originally written by Baris Umog (https://www.kaggle.com/barisumog).

    Each key of the JSON file becomes an attribute that returns a function
    joining the configured directory with the given path components.

    Usage:
        # In SETTINGS.json, "data": "/path/to/data/".
        # To load "/path/to/data/targets.array" file to y:
        PATH = PathJoiner()
        y = load(PATH.data('targets.array'))
    """

    def __init__(self, filename='SETTINGS.json'):
        """Read the name-to-directory mapping from a JSON file.

        Args:
            filename (str): Path to the JSON settings file.
        """
        with open(filename) as file:
            self.subdirs = json.load(file)

    def __getattr__(self, attr):
        """Return a path-joining function for the directory named ``attr``.

        Args:
            attr (str): A key of the settings file.

        Returns:
            A function taking any number of path components and returning
            them joined onto the configured directory.

        Raises:
            AttributeError: If ``attr`` is not a key of the settings file.
        """
        # Read the mapping through __dict__: if `subdirs` has not been set
        # yet (e.g. during copy/unpickle), `self.subdirs` would re-enter
        # __getattr__ and recurse forever.
        subdirs = self.__dict__.get('subdirs', {})
        if attr not in subdirs:
            # __getattr__ must signal a missing name with AttributeError so
            # that hasattr(), getattr(..., default), copy and pickle work.
            raise AttributeError(
                '{!r} is not a directory defined in the settings'.format(attr))

        subdir = subdirs[attr]
        return lambda *dirs: os.path.join(subdir, *dirs)
def stream_lines(filename, encoding='utf-8', ignore_errors=False):
    """Yield the lines of a text file one at a time.

    Args:
        filename (str): Path to the text file.
        encoding (str): Text encoding used to decode the file.
        ignore_errors (boolean): When true, decoding errors are ignored
            (``errors='ignore'``); otherwise decoding is strict.

    Yields:
        Each line of the file, as read by iterating the file object.
    """
    if ignore_errors:
        error_mode = 'ignore'
    else:
        error_mode = 'strict'

    with open(filename, encoding=encoding, errors=error_mode) as handle:
        for text in handle:
            yield text
def stream_csv(filename, encoding='utf-8', ignore_errors=False):
    """Return a ``csv.reader`` over the lines of a text file.

    Args:
        filename (str): Path to the CSV file.
        encoding (str): Text encoding used to decode the file.
        ignore_errors (boolean): When true, decoding errors are ignored.

    Returns:
        A csv.reader fed by ``stream_lines``.
    """
    return csv.reader(stream_lines(filename, encoding, ignore_errors))
def limit_stream(stream, count=1, skip=0):
    """Yield at most ``count`` items of ``stream`` after skipping ``skip``.

    If the stream runs out early, iteration simply stops. Calling
    ``next()`` directly inside a generator would instead surface as
    ``RuntimeError`` on Python 3.7+ (PEP 479).

    Args:
        stream (iterable): Source of items; iterators and plain iterables
            are both accepted.
        count (int): Maximum number of items to yield.
        skip (int): Number of leading items to discard first.

    Yields:
        Up to ``count`` items following the first ``skip`` items.
    """
    from itertools import islice

    # Clamp negatives to zero: range() treated them as "no iterations",
    # whereas islice() rejects negative bounds.
    first = max(skip, 0)
    last = first + max(count, 0)

    for item in islice(stream, first, last):
        yield item
def save_obj(filename, obj):
    """Pickle ``obj`` to ``filename`` and log what was saved.

    Args:
        filename (str): Destination file, opened in binary write mode.
        obj: Any picklable object; written with the highest pickle
            protocol available.
    """
    with open(filename, 'wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)

    message = 'saved : {}\t{}'.format(filename, type(obj))
    logger.info(message)
def load_obj(filename):
    """Unpickle and return the object stored in ``filename``.

    NOTE: unpickling can execute arbitrary code, so only use this on
    files from a trusted source.

    Args:
        filename (str): Pickle file, opened in binary read mode.

    Returns:
        The unpickled object.
    """
    with open(filename, 'rb') as source:
        loaded = pickle.load(source)

    message = 'loaded : {}\t{}'.format(filename, type(loaded))
    logger.info(message)
    return loaded
def save_array(filename, X):
    """Store an array in an HDF5 file under the ``data`` key and log it.

    Args:
        filename (str): Destination HDF5 file (overwritten).
        X: Array to store; its ``dtype`` and ``shape`` are logged.
    """
    with h5py.File(filename, 'w') as store:
        store['data'] = X

    message = 'saved : {}\t{}\t{}'.format(filename, X.dtype, X.shape)
    logger.info(message)
def load_array(filename):
    """Read the array stored under the ``data`` key of an HDF5 file.

    Args:
        filename (str): HDF5 file to read.

    Returns:
        The stored array, fully read into memory.
    """
    with h5py.File(filename, 'r') as store:
        array = store['data'][...]

    message = 'loaded : {}\t{}\t{}'.format(filename, array.dtype, array.shape)
    logger.info(message)
    return array
def save_sparse(filename, X):
    """Store a sparse matrix in an HDF5 file as its CSR components.

    The file holds ``shape``, ``data``, ``indices`` and ``indptr``, which
    is the layout ``load_sparse`` rebuilds a CSR matrix from.

    Args:
        filename (str): Destination HDF5 file (overwritten).
        X (scipy sparse matrix): Matrix to store. Non-CSR formats are
            converted to CSR first.
    """
    # The stored arrays are interpreted as CSR on load. Writing the index
    # arrays of another format (e.g. CSC) would silently produce a wrong
    # matrix, and formats such as COO have no indptr at all.
    if not sparse.isspmatrix_csr(X):
        X = X.tocsr()

    with h5py.File(filename, 'w') as file:
        file['shape'] = np.array(X.shape)
        file['data'] = X.data
        file['indices'] = X.indices
        file['indptr'] = X.indptr

    logger.info('saved : {}\t{}\t{}'.format(filename, X.dtype, X.shape))
def load_sparse(filename):
    """Rebuild a CSR matrix from the components stored in an HDF5 file.

    Reads ``shape``, ``data``, ``indices`` and ``indptr`` from the file.

    Args:
        filename (str): HDF5 file to read.

    Returns:
        A scipy CSR matrix.
    """
    with h5py.File(filename, 'r') as store:
        dims = tuple(store['shape'][...])
        values = store['data'][...]
        col_indices = store['indices'][...]
        row_pointers = store['indptr'][...]

    matrix = sparse.csr_matrix((values, col_indices, row_pointers),
                               shape=dims)
    message = 'loaded : {}\t{}\t{}'.format(filename, matrix.dtype,
                                             matrix.shape)
    logger.info(message)
    return matrix
def save(filename, X):
    """Save ``X`` with the writer selected by the extension of ``filename``.

    The text after the last ``.`` picks the writer: ``obj`` (pickle),
    ``array`` (HDF5 array) or ``sparse`` (HDF5 sparse matrix).

    Args:
        filename (str): Destination file name.
        X: Object to save.
    """
    writers = {'obj': save_obj, 'array': save_array, 'sparse': save_sparse}
    suffix = filename.rsplit('.', 1)[-1]
    writers[suffix](filename, X)
def load(filename):
    """Load a file with the reader selected by its extension.

    The text after the last ``.`` picks the reader: ``obj`` (pickle),
    ``array`` (HDF5 array) or ``sparse`` (HDF5 sparse matrix).

    Args:
        filename (str): File to read.

    Returns:
        Whatever the selected reader returns.
    """
    readers = {'obj': load_obj, 'array': load_array, 'sparse': load_sparse}
    suffix = filename.rsplit('.', 1)[-1]
    return readers[suffix](filename)
class Clock(object):
    """Stopwatch that logs elapsed time since creation and since the
    previous check.

    Attributes are plain ``time.time()`` timestamps:
        start: when the clock was created.
        last: when the previous check finished (initially ``start``).
        now: when the most recent check began (initially ``start``).
    """

    def __init__(self):
        """Start the clock and log an initial report."""
        created = time.time()
        self.start = created
        self.last = created
        self.now = created
        self.report()

    def check(self):
        """Take a new reading, log it, then make it the reference point
        for the next "since last" interval."""
        self.now = time.time()
        self.report()
        self.last = self.now

    def report(self):
        """Log the wall-clock time and both elapsed intervals, rounded to
        whole seconds."""
        template = ('\n[CLOCK] [ {} ] '
                    'since start: [ {} ] since last: [ {} ]\n')

        # The fourth field of asctime() is the HH:MM:SS part.
        wall_clock = time.asctime().split()[3]
        total = datetime.timedelta(seconds=round(self.now - self.start))
        lap = datetime.timedelta(seconds=round(self.now - self.last))

        logger.info(template.format(wall_clock, total, lap))
def beep(n=1):
    """Run the system ``beep`` command ``n`` times.

    Args:
        n (int): Number of times to invoke the command.
    """
    for _repeat in range(n):
        os.system('beep')
def print_shape_type(*objs):
    """Log the shape, dtype and type of each given object.

    Objects exposing both ``shape`` and ``dtype`` are logged at INFO
    level. Objects missing either attribute are logged at ERROR level with
    whatever shape is available (``None`` if there is none) and their
    type.

    Args:
        *objs: Objects to describe.
    """
    for obj in objs:
        try:
            # The first argument to a logging call is the format string, so
            # the values must be passed as arguments to an explicit format.
            logger.info('%s\t%s\t%s', obj.shape, obj.dtype, type(obj))
        except AttributeError:
            # `shape` itself may be the missing attribute, so it cannot be
            # read unguarded here without raising again.
            logger.error('%s\t%s', getattr(obj, 'shape', None), type(obj))