Source code for mriqc.classifier.data

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author: oesteban
# @Date:   2015-11-19 16:44:27

"""
===================
Data handler module
===================

Reads and writes the CSV files holding the image quality metrics (IQMs)
and their associated quality ratings


"""
from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np
import pandas as pd
from builtins import str

from .. import logging
from ..utils.misc import BIDS_COMP
LOG = logging.getLogger('mriqc.classifier')

def get_groups(X, label='site'):
    """
    Generate the index of sites

    Parameters
    ----------
    X : pandas.DataFrame
        Table containing a column named ``label``.
    label : str
        Name of the column holding the group (site) of every row.

    Returns
    -------
    tuple
        ``(indices, gnames)`` where ``gnames`` is the sorted list of the
        unique values found in ``X[label]`` and ``indices`` is a list with,
        for every row, the position of its value within ``gnames``.

    """
    groups = X[label].values.ravel().tolist()
    gnames = sorted(set(groups))
    # One dictionary lookup per row instead of a linear ``list.index`` scan
    positions = {name: idx for idx, name in enumerate(gnames)}
    return [positions[group] for group in groups], gnames
def combine_datasets(inputs, rating_label='rater_1'):
    """
    Reads several datasets and stacks them into a single table

    Parameters
    ----------
    inputs : iterable
        Triplets ``(features_file, labels_file, site_name)``; each one is
        loaded with :func:`read_dataset` (``binarize=True``).
    rating_label : str or list
        Forwarded to :func:`read_dataset` as ``rate_label``.

    Returns
    -------
    pandas.DataFrame
        Concatenation of all datasets. Columns are ordered as: BIDS bits
        present (in ``BIDS_COMP`` order), ``database``, ``site``, the rating
        column, and then all the remaining columns sorted by name.

    """
    mdata = []
    for dataset_x, dataset_y, sitename in inputs:
        sitedata, _ = read_dataset(
            dataset_x, dataset_y, rate_label=rating_label,
            binarize=True, site_name=sitename)
        sitedata['database'] = [sitename] * len(sitedata)

        # Fall back to the dataset name when the tables carry no site info
        if 'site' not in sitedata.columns.ravel().tolist():
            sitedata['site'] = [sitename] * len(sitedata)

        mdata.append(sitedata)

    mdata = pd.concat(mdata)

    all_cols = mdata.columns.ravel().tolist()

    # read_dataset uses the first label when a list of raters is given, so
    # that is the rating column present here (it used to be hard-coded to
    # 'rater_1', which broke any other ``rating_label``).
    if isinstance(rating_label, (list, tuple)):
        rating_col = rating_label[0]
    else:
        rating_col = rating_label

    ordered_cols = get_bids_cols(mdata) + ['database', 'site', rating_col]
    ordered_cols += sorted(set(all_cols) - set(ordered_cols))
    return mdata[ordered_cols]
def get_bids_cols(dataframe):
    """
    Lists the BIDS bits found among the columns of ``dataframe``,
    following the order of the keys of ``BIDS_COMP``
    """
    available = set(dataframe.columns.ravel().tolist())
    found = []
    for bit in BIDS_COMP.keys():
        if bit in available:
            found.append(bit)
    return found
def read_iqms(feat_file):
    """
    Reads in the features

    Parameters
    ----------
    feat_file : str or file-like
        CSV file with the IQMs, handed over to :func:`pandas.read_csv`.
        The BIDS columns are read as strings.

    Returns
    -------
    x_df : pandas.DataFrame
        The table, sorted by the BIDS bits present and with the ``sub-``
        prefix removed from ``subject_id``.
    feat_names : list
        Names of the numeric columns, excluding the BIDS columns and those
        starting with ``size_``, ``spacing_`` or ``Unnamed``.
    bids_comps_present : list
        BIDS bits found in the file, in ``BIDS_COMP`` order.

    """
    bids_comps = list(BIDS_COMP.keys())
    x_df = pd.read_csv(feat_file, index_col=False,
                       dtype={col: str for col in bids_comps})

    # Find present bids bits and sort by them
    bids_comps_present = get_bids_cols(x_df)
    x_df = x_df.sort_values(by=bids_comps_present)

    # Remove sub- prefix in subject_id. ``str.lstrip('sub-')`` cannot be used
    # here: it strips any leading run of the characters 's', 'u', 'b', '-'
    # and would also eat the beginning of ids such as 'sub-sub01'.
    subject = x_df.subject_id
    has_prefix = subject.str.startswith('sub-', na=False)
    x_df['subject_id'] = subject.where(
        ~has_prefix, subject.str.slice(start=len('sub-')))

    # Remove columns that are not IQMs. A new list is built instead of
    # calling ``remove`` while iterating, which skipped the element following
    # every removal (e.g. 'size_y' right after 'size_x').
    non_iqm_prefixes = ('size_', 'spacing_', 'Unnamed')
    feat_names = [
        col for col in x_df._get_numeric_data().columns.ravel()
        if col not in bids_comps and not col.startswith(non_iqm_prefixes)
    ]

    return x_df, feat_names, bids_comps_present
def read_labels(label_file, rate_label='rater_1', binarize=True,
                site_name=None, rate_selection='random', collapse=True):
    """
    Reads in the labels. Massage labels table to have the appropriate format

    Parameters
    ----------
    label_file : str or file-like
        CSV file with the ratings, handed over to :func:`pandas.read_csv`.
    rate_label : str or list of str
        Name(s) of the rating column(s). When exactly two names are given,
        both columns are merged into one (see Notes).
    binarize : bool
        If set, ratings ``>= 0`` of the first output column become ``0`` and
        ratings ``< 0`` become ``1``. Missing ratings are left as NaN.
    site_name : str or None
        Value used to fill a ``site`` column when the file has none.
    rate_selection : str
        Not used by this function; kept for interface compatibility.
    collapse : bool
        Only relevant with two raters. If set, the merged column takes the
        name of the first rater and the original ``rater*`` columns are
        renamed with an ``indv_`` prefix; otherwise the merged column is
        called ``merged_ratings`` and returned along the individual ones.

    Returns
    -------
    pandas.DataFrame
        BIDS columns present, then ``site`` (if available) and the rating
        column(s).

    Notes
    -----
    With two raters, samples rated only by one keep that rating; for samples
    rated by both, a random half takes the second rater's value and the
    remaining half the first's. ``np.random.seed(42)`` is called for this, so
    the state of the global numpy random generator is reset.

    """
    # Work on private copies: the caller's list must never be modified
    if isinstance(rate_label, str):
        rate_label = [rate_label]
    else:
        rate_label = list(rate_label)
    output_labels = list(rate_label)

    bids_comps = list(BIDS_COMP.keys())
    y_df = pd.read_csv(label_file, index_col=False,
                       dtype={col: str for col in bids_comps})

    # Find present bids bits and sort by them
    bids_comps_present = get_bids_cols(y_df)
    y_df = y_df.sort_values(by=bids_comps_present)

    # Remove the 'sub-' prefix only: ``str.lstrip('sub-')`` strips a set of
    # characters and would mangle ids such as 'sub-sub01'.
    subject = y_df.subject_id
    has_prefix = subject.str.startswith('sub-', na=False)
    y_df['subject_id'] = subject.where(
        ~has_prefix, subject.str.slice(start=len('sub-')))

    y_df[rate_label] = y_df[rate_label].apply(pd.to_numeric, errors='raise')

    if len(rate_label) == 2:
        np.random.seed(42)
        first = y_df[[rate_label[0]]].values.ravel()
        second = y_df[[rate_label[1]]].values.ravel()
        ratermask_1 = ~np.isnan(first)
        ratermask_2 = ~np.isnan(second)
        all_rated = (ratermask_1 & ratermask_2)

        # Start from rater 1 and overwrite with rater 2 wherever available
        mergey = np.array(first.tolist())
        mergey[ratermask_2] = second[ratermask_2]

        # Half of the doubly-rated samples (chosen at random) keep rater 2;
        # the other half is reverted to rater 1.
        subsmpl = np.random.choice(
            np.where(all_rated)[0], int(0.5 * np.sum(all_rated)),
            replace=False)
        all_rated[subsmpl] = False
        mergey[all_rated] = first[all_rated]
        y_df['merged_ratings'] = mergey.astype(int)

        # Set default name
        if collapse:
            cols = [('indv_%s' % c) if c.startswith('rater') else c
                    for c in y_df.columns.ravel().tolist()]
            cols[y_df.columns.get_loc('merged_ratings')] = rate_label[0]
            y_df.columns = cols
            output_labels = [rate_label[0]]
        else:
            output_labels = ['merged_ratings'] + rate_label

    if binarize:
        target = output_labels[0]
        ratings = y_df[target]
        # Both masks are computed up front and exclude NaN, so that unrated
        # samples stay NaN instead of being silently labeled as 1.
        non_negative = ratings >= 0
        negative = ratings < 0
        y_df.loc[non_negative, target] = 0
        y_df.loc[negative, target] = 1

    if 'site' in y_df.columns.ravel().tolist():
        output_labels.insert(0, 'site')
    elif site_name is not None:
        y_df['site'] = [site_name] * len(y_df)
        output_labels.insert(0, 'site')

    return y_df[bids_comps_present + output_labels]
def read_dataset(feat_file, label_file, merged_name=None, binarize=True,
                 site_name=None, rate_label='rater_1', rate_selection='random'):
    """
    Reads in the features and labels

    Parameters
    ----------
    feat_file : str or file-like
        CSV file with the IQMs (see :func:`read_iqms`).
    label_file : str or file-like
        CSV file with the ratings (see :func:`read_labels`).
    merged_name : str or None
        If given, the merged table is written to this CSV file before the
        samples without a valid rating are dropped.
    binarize, site_name, rate_label, rate_selection
        Forwarded to :func:`read_labels` (always with ``collapse=True``).
        When ``rate_label`` is a list or tuple, its first element names the
        rating column of the output.

    Returns
    -------
    x_df : pandas.DataFrame
        Features left-merged with the labels on the BIDS columns, without
        the rows whose rating is null.
    feat_names : list
        Names of the feature columns, as returned by :func:`read_iqms`.

    Raises
    ------
    RuntimeError
        If features and labels do not contain the same BIDS columns.

    """
    # Pick the rating column name *before* calling read_labels and hand over
    # a copy, so the name cannot be affected by whatever the callee does with
    # the list.
    if isinstance(rate_label, (list, tuple)):
        rating_col = rate_label[0]
        labels_arg = list(rate_label)
    else:
        rating_col = rate_label
        labels_arg = rate_label

    x_df, feat_names, _ = read_iqms(feat_file)
    y_df = read_labels(label_file, labels_arg, binarize, collapse=True,
                       site_name=site_name, rate_selection=rate_selection)

    # Find present bids bits (features vs. labels; the labels' list used to
    # be computed from the features table, making this check a no-op).
    bids_comps_x = get_bids_cols(x_df)
    bids_comps_y = get_bids_cols(y_df)

    if bids_comps_x != bids_comps_y:
        raise RuntimeError('Labels and features cannot be merged')

    def _bids_ids(frame):
        """Joins the BIDS columns of every row with '_' into a single id"""
        ids = frame.subject_id
        for comp in bids_comps_x[1:]:
            ids = ids.str.cat(frame.loc[:, comp].astype(str), sep='_')
        return ids

    # Remove failed cases from Y (labels without a matching features row)
    x_ids = _bids_ids(x_df)
    y_ids = _bids_ids(y_df)
    y_df = y_df[y_ids.isin(x_ids.values.ravel().tolist())]

    # Merge Y dataframe into X
    x_df = pd.merge(x_df, y_df, on=bids_comps_x, how='left')

    if merged_name is not None:
        x_df.to_csv(merged_name, index=False)

    # Drop samples with invalid rating
    nan_labels = x_df[x_df[rating_col].isnull()].index.ravel().tolist()
    if nan_labels:
        LOG.info('Dropping %d samples for having non-numerical '
                 'labels', len(nan_labels))
        x_df = x_df.drop(nan_labels)

    # Print out some info
    nsamples = len(x_df)
    LOG.info('Created dataset X="%s", Y="%s" (N=%d valid samples)',
             feat_file, label_file, nsamples)

    # Inform about ratings distribution
    labels = sorted(set(x_df[rating_col].values.ravel().tolist()))
    ldist = [int(np.sum(x_df[rating_col] == label)) for label in labels]

    LOG.info('Ratings distribution: %s (%s, %s)',
             '/'.join(['%d' % x for x in ldist]),
             '/'.join(['%.2f%%' % (100 * x / nsamples) for x in ldist]),
             'accept/exclude' if len(ldist) == 2 else 'exclude/doubtful/accept')

    return x_df, feat_names
def balanced_leaveout(dataframe, site_column='site', rate_label='rater_1'):
    """
    Leaves out one positive and one negative sample per site

    For every site with more than four samples rated ``1``, one sample rated
    ``1`` and one rated ``0`` are drawn at random (``np.random.choice``).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Dataset with, at least, the columns ``site_column`` and
        ``rate_label``.
    site_column : str
        Name of the column identifying the site of each sample.
    rate_label : str
        Name of the column holding the (binary) rating.

    Returns
    -------
    dataframe : pandas.DataFrame
        The input without the rows that were drawn.
    left_out : pandas.DataFrame
        Copy of the drawn rows (positives first, then negatives).

    Raises
    ------
    ValueError
        Raised by ``np.random.choice`` when a site with more than four
        positives has no sample rated ``0``.

    """
    site_of_row = np.asarray(dataframe[site_column])
    rating_of_row = np.asarray(dataframe[rate_label])

    pos_draw = []
    neg_draw = []

    for site in dataframe[site_column].unique():
        # Exact match; a substring/regex test (``str.contains``) would make
        # site 'A' also pick the samples of site 'AB'.
        in_site = site_of_row == site

        # Row *positions* are drawn, so that the selection below is right
        # for any index (non-default, or with repeated labels).
        site_pos = np.flatnonzero(in_site & (rating_of_row == 1))
        if len(site_pos) > 4:
            pos_draw.append(np.random.choice(site_pos))

            site_neg = np.flatnonzero(in_site & (rating_of_row == 0))
            neg_draw.append(np.random.choice(site_neg))

    drawn = np.array(pos_draw + neg_draw, dtype=int)
    left_out = dataframe.iloc[drawn].copy()

    keep = np.ones(len(dataframe), dtype=bool)
    keep[drawn] = False
    dataframe = dataframe.iloc[np.flatnonzero(keep)]
    return dataframe, left_out
def zscore_dataset(dataframe, excl_columns=None, by='site', njobs=-1):
    """
    Returns a dataset zscored by the column given as argument

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input dataset; it is not modified.
    excl_columns : list or None
        Names of columns that must not be z-scored. The list is not
        modified.
    by : str
        Name of the grouping column: numeric columns are z-scored
        separately for each unique value of this column.
    njobs : int
        Number of worker processes. Values ``<= 0`` select the number of
        CPUs; with ``1`` the sites are processed serially in this process.

    Returns
    -------
    pandas.DataFrame
        Copy of ``dataframe`` with the numeric columns z-scored per group.
        Numeric columns whose sum is not finite are left untouched, and so
        is every column that ends up with NaN or infinite values after
        z-scoring (the original values are restored and a warning is
        logged).

    """
    from multiprocessing import Pool, cpu_count

    LOG.info('z-scoring dataset ...')

    if njobs <= 0:
        njobs = cpu_count()

    sites = list(set(dataframe[[by]].values.ravel().tolist()))
    columns = list(dataframe.select_dtypes([np.number]).columns.ravel())

    # Private copy, the caller's list must not grow
    excluded = list(excl_columns) if excl_columns is not None else []

    # Columns with NaN/inf cannot be z-scored
    for col in columns:
        if not np.isfinite(np.sum(dataframe[[col]].values.ravel())):
            excluded.append(col)

    # The grouping column itself is never z-scored
    skipped = set(excluded)
    skipped.add(by)
    columns = [col for col in columns if col not in skipped]

    zs_df = dataframe.copy()

    if columns:
        args = [(zs_df, columns, s, by) for s in sites]
        if njobs == 1:
            results = [zscore_site(arg) for arg in args]
        else:
            pool = Pool(njobs)
            try:
                results = pool.map(zscore_site, args)
            finally:
                # Always release the worker processes
                pool.close()
                pool.join()

        for site, res in zip(sites, results):
            zs_df.loc[zs_df[by] == site, columns] = res

    # ``replace`` is not in-place: keep the result so that infinite values
    # are detected (and reverted) together with NaNs below.
    zs_df = zs_df.replace([np.inf, -np.inf], np.nan)
    nan_columns = zs_df.columns[zs_df.isnull().any()].tolist()

    if nan_columns:
        LOG.warning('Columns %s contain NaNs after z-scoring.',
                    ", ".join(nan_columns))
        zs_df[nan_columns] = dataframe[nan_columns].values

    return zs_df


def zscore_site(args):
    """
    z-scores only one site

    Parameters
    ----------
    args : tuple
        ``(dataframe, columns, site)`` or ``(dataframe, columns, site, by)``.
        Rows of ``dataframe`` whose ``by`` column (``'site'`` when omitted)
        equals ``site`` are selected, and their ``columns`` are z-scored.
        A single tuple is taken so the function can be used with
        ``Pool.map``.

    Returns
    -------
    numpy.ndarray
        z-scores (``ddof=1``, computed column-wise) of the selected rows.

    """
    from scipy.stats import zscore
    dataframe, columns, site = args[:3]
    by = args[3] if len(args) > 3 else 'site'
    return zscore(dataframe.loc[dataframe[by] == site, columns].values,
                  ddof=1, axis=0)