# Source code for kaggler.data_io

import csv
import datetime
import h5py
import heapq
from io import open
import json
from logging import getLogger
import numpy as np
import os
import pickle
from sklearn.datasets import load_svmlight_file, dump_svmlight_file
from scipy import sparse
import time


# Module-level logger named after this module; handlers/levels are left to the
# host application's logging configuration.
logger = getLogger(__name__)


def is_number(s):
    """Tell whether the string can be parsed as a float.

    Args:
        s (str): String to test.

    Returns:
        bool: True if ``float(s)`` succeeds, False otherwise.
    """
    try:
        float(s)
    except ValueError:
        return False
    return True
def save_data(X, y, path):
    """Save data as a CSV, LibSVM or HDF5 file based on the file extension.

    Args:
        X (numpy or scipy sparse matrix): Data matrix
        y (numpy array): Target vector. If None, all zero vector will be saved.
        path (str): Path to the CSV, LibSVM or HDF5 file to save data.
    """
    # Dispatch on the file extension.
    savers = {'.csv': save_csv,
              '.sps': save_libsvm,
              '.h5': save_hdf5}
    ext = os.path.splitext(path)[1]
    if y is None:
        y = np.zeros((X.shape[0], ))
    savers[ext](X, y, path)
def save_csv(X, y, path):
    """Save data as a CSV file.

    Args:
        X (numpy or scipy sparse matrix): Data matrix
        y (numpy array): Target vector.
        path (str): Path to the CSV file to save data.
    """
    dense_X = X.todense() if sparse.issparse(X) else X
    # The target becomes the first column of the saved file.
    target_col = y.reshape((-1, 1))
    np.savetxt(path, np.hstack((target_col, dense_X)), delimiter=',')
def save_libsvm(X, y, path):
    """Save data as a LibSVM file.

    Args:
        X (numpy or scipy sparse matrix): Data matrix
        y (numpy array): Target vector.
        path (str): Path to the LibSVM file to save data.
    """
    # LibSVM convention numbers features from 1, hence zero_based=False.
    dump_svmlight_file(X, y, path, zero_based=False)
def save_hdf5(X, y, path):
    """Save data as a HDF5 file.

    Args:
        X (numpy or scipy sparse matrix): Data matrix
        y (numpy array): Target vector.
        path (str): Path to the HDF5 file to save data.
    """
    with h5py.File(path, 'w') as f:
        if sparse.issparse(X):
            f['issparse'] = 1
            f['target'] = y
            # Store the CSR components so the matrix can be rebuilt on load.
            csr = X if sparse.isspmatrix_csr(X) else X.tocsr()
            f['shape'] = np.array(csr.shape)
            f['data'] = csr.data
            f['indices'] = csr.indices
            f['indptr'] = csr.indptr
        else:
            f['issparse'] = 0
            f['target'] = y
            f['data'] = X
def load_data(path, dense=False):
    """Load data from a CSV, LibSVM or HDF5 file based on the file extension.

    Args:
        path (str): A path to the CSV, LibSVM or HDF5 format file.
        dense (boolean): An optional variable indicating if the return matrix
            should be dense.  By default, it is false.

    Returns:
        Data matrix X and target vector y
    """
    # Dispatch on the file extension.
    loaders = {'.csv': load_csv,
               '.sps': load_svmlight_file,
               '.h5': load_hdf5}
    ext = os.path.splitext(path)[1]
    X, y = loaders[ext](path)
    if dense and sparse.issparse(X):
        X = X.todense()
    return X, y
def load_csv(path):
    """Load data from a CSV file.

    The file may begin with a single header row; it is skipped when the
    first comma-separated field of the first line does not parse as a
    number.  The first column is the target; the rest are features.

    Args:
        path (str): A path to the CSV format file containing data.

    Returns:
        Data matrix X and target vector y
    """
    # Peek at the first line to decide whether a header row is present.
    with open(path) as f:
        first_field = f.readline().strip().split(',')[0]
    try:
        float(first_field)
        skiprows = 0
    except ValueError:
        # Non-numeric first field -> header row, skip it.
        skiprows = 1
    X = np.loadtxt(path, delimiter=',', skiprows=skiprows)
    y = np.array(X[:, 0]).flatten()
    X = X[:, 1:]
    return X, y
def load_hdf5(path):
    """Load data from a HDF5 file written by :func:`save_hdf5`.

    Args:
        path (str): A path to the HDF5 format file containing data.

    Returns:
        Data matrix X (CSR sparse if it was saved sparse) and target
        vector y
    """
    with h5py.File(path, 'r') as f:
        is_sparse = f['issparse'][...]
        if is_sparse:
            # Rebuild the CSR matrix from its stored components.
            shape = tuple(f['shape'][...])
            data = f['data'][...]
            indices = f['indices'][...]
            indptr = f['indptr'][...]
            X = sparse.csr_matrix((data, indices, indptr), shape=shape)
        else:
            X = f['data'][...]
        y = f['target'][...]
    return X, y
def read_sps(path):
    """Read a LibSVM file line-by-line.

    Args:
        path (str): A path to the LibSVM file to read.

    Yields:
        data (list) and target (int).
    """
    # Use a context manager so the file is closed once iteration finishes
    # (the original left the handle to be collected by the GC).
    with open(path) as f:
        for line in f:
            xs = line.rstrip().split(' ')
            # xs[0] is the label; the remainder are 'index:value' tokens.
            yield xs[1:], int(xs[0])
def shuf_file(f, shuf_win):
    """Yield the lines of *f* in a locally shuffled order.

    A bounded heap of size *shuf_win* is keyed on each line's hash; once
    the window is full, every new line evicts the smallest entry, and the
    remaining entries are drained at the end.

    Args:
        f: An iterable of lines (e.g. an open file).
        shuf_win (int): Size of the shuffle window.

    Yields:
        Lines of *f*.
    """
    window = []
    for line in f:
        entry = (hash(line), line)
        if len(window) < shuf_win:
            heapq.heappush(window, entry)
        else:
            yield heapq.heappushpop(window, entry)[1]
    while window:
        yield heapq.heappop(window)[1]
class PathJoiner:
    """Load directory names from SETTINGS.json.

    Originally written by Baris Umog (https://www.kaggle.com/barisumog).

    Usage:
        # In SETTINGS.json, "data": "/path/to/data/".
        # To load "/path/to/data/targets.array" file to y:
        PATH = PathJoiner()
        y = load(PATH.data('targets.array'))
    """

    def __init__(self, filename='SETTINGS.json'):
        # Map of attribute name -> base directory, read from the JSON file.
        with open(filename) as settings:
            self.subdirs = json.load(settings)

    def __getattr__(self, attr):
        # Unknown attributes resolve to path-joining callables rooted at
        # the configured directory.
        base = self.subdirs[attr]

        def join(*dirs):
            return os.path.join(base, *dirs)

        return join
def stream_lines(filename, encoding='utf-8', ignore_errors=False):
    """Yield lines from a text file.

    Args:
        filename (str): Path to the file.
        encoding (str): Text encoding of the file.
        ignore_errors (bool): If True, undecodable bytes are skipped
            instead of raising UnicodeDecodeError.

    Yields:
        str: Lines of the file.
    """
    errors = 'ignore' if ignore_errors else 'strict'
    with open(filename, encoding=encoding, errors=errors) as file:
        for line in file:
            yield line


def stream_csv(filename, encoding='utf-8', ignore_errors=False):
    """Return a csv.reader over the lines of a file."""
    stream = stream_lines(filename, encoding, ignore_errors)
    return csv.reader(stream)


def limit_stream(stream, count=1, skip=0):
    """Yield at most *count* items from *stream* after skipping *skip*.

    Raises:
        StopIteration: If *stream* is exhausted before *skip* + *count*
            items have been consumed.
    """
    for _ in range(skip):
        next(stream)
    for _ in range(count):
        yield next(stream)


def save_obj(filename, obj):
    """Pickle *obj* to *filename* using the highest protocol."""
    with open(filename, 'wb') as file:
        pickle.dump(obj, file, protocol=pickle.HIGHEST_PROTOCOL)
    logger.info('saved : {}\t{}'.format(filename, type(obj)))


def load_obj(filename):
    """Unpickle and return the object stored in *filename*.

    NOTE(review): pickle.load on an untrusted file can execute arbitrary
    code - only load files produced by save_obj on trusted data.
    """
    with open(filename, 'rb') as file:
        obj = pickle.load(file)
    logger.info('loaded : {}\t{}'.format(filename, type(obj)))
    return obj


def save_array(filename, X):
    """Save a dense numpy array to an HDF5 file under the 'data' key."""
    with h5py.File(filename, 'w') as file:
        file['data'] = X
    logger.info('saved : {}\t{}\t{}'.format(filename, X.dtype, X.shape))


def load_array(filename):
    """Load a dense numpy array saved by :func:`save_array`."""
    with h5py.File(filename, 'r') as file:
        X = file['data'][...]
    logger.info('loaded : {}\t{}\t{}'.format(filename, X.dtype, X.shape))
    return X


def save_sparse(filename, X):
    """Save a CSR sparse matrix to an HDF5 file as its components."""
    with h5py.File(filename, 'w') as file:
        file['shape'] = np.array(X.shape)
        file['data'] = X.data
        file['indices'] = X.indices
        file['indptr'] = X.indptr
    logger.info('saved : {}\t{}\t{}'.format(filename, X.dtype, X.shape))


def load_sparse(filename):
    """Load a CSR sparse matrix saved by :func:`save_sparse`."""
    with h5py.File(filename, 'r') as file:
        shape = tuple(file['shape'][...])
        data = file['data'][...]
        indices = file['indices'][...]
        indptr = file['indptr'][...]
    X = sparse.csr_matrix((data, indices, indptr), shape=shape)
    logger.info('loaded : {}\t{}\t{}'.format(filename, X.dtype, X.shape))
    return X


def save(filename, X):
    """Save *X* with the saver chosen by the filename's last extension.

    'obj' -> pickle, 'array' -> dense HDF5, 'sparse' -> sparse HDF5.
    """
    catalog = {'obj': save_obj, 'array': save_array, 'sparse': save_sparse}
    extension = filename.split('.')[-1]
    func = catalog[extension]
    func(filename, X)


def load(filename):
    """Load an object saved by :func:`save`, dispatching on the extension."""
    catalog = {'obj': load_obj, 'array': load_array, 'sparse': load_sparse}
    extension = filename.split('.')[-1]
    func = catalog[extension]
    X = func(filename)
    return X


class Clock(object):
    """Wall-clock timer that logs elapsed time between checks."""

    def __init__(self):
        self.start = time.time()
        self.last = self.start
        self.now = self.start
        self.report()

    def check(self):
        """Log the time since start and since the previous check."""
        self.now = time.time()
        self.report()
        self.last = self.now

    def report(self):
        """Log current time plus elapsed-since-start and since-last deltas."""
        txt = '\n[CLOCK] [ {} ] '
        txt += 'since start: [ {} ] since last: [ {} ]\n'
        # time.asctime() looks like 'Mon Jan  1 12:00:00 2024'; field 3 is
        # the HH:MM:SS clock.
        current = time.asctime().split()[3]
        since_start = datetime.timedelta(seconds=round(self.now - self.start))
        since_last = datetime.timedelta(seconds=round(self.now - self.last))
        logger.info(txt.format(current, since_start, since_last))


def beep(n=1):
    """Play the system beep *n* times (relies on an external `beep` tool)."""
    for _ in range(n):
        os.system('beep')


def print_shape_type(*objs):
    """Log shape, dtype and type for each argument when available.

    Fixes two defects in the original: logging calls passed obj.shape as
    the format string (extra args were %-format args, raising
    'not all arguments converted'), and the AttributeError fallback
    re-read obj.shape - the very attribute whose absence triggered it.
    """
    for obj in objs:
        try:
            logger.info('%s\t%s\t%s', obj.shape, obj.dtype, type(obj))
        except AttributeError:
            # obj lacks shape and/or dtype (e.g. a plain list).
            logger.error('%s\t%s', obj, type(obj))