Source code for skll.data.writers

# License: BSD 3 clause
"""
Handles writing data to various types of data files.

:author: Dan Blanchard (dblanchard@ets.org)
:author: Michael Heilman (mheilman@ets.org)
:author: Nitin Madnani (nmadnani@ets.org)
:organization: ETS
"""

from __future__ import absolute_import, print_function, unicode_literals

import json
import logging
import os
import re
import sys
from csv import DictWriter
from decimal import Decimal
from io import open

import numpy as np
from six import iteritems, PY2, string_types, text_type
from six.moves import map
from sklearn.feature_extraction import FeatureHasher


class Writer(object):

    """
    Helper class for writing out FeatureSets to files on disk.

    Parameters
    ----------
    path : str
        A path to the feature file we would like to create. The suffix
        to this filename must be ``.arff``, ``.csv``, ``.jsonlines``,
        ``.libsvm``, ``.megam``, ``.ndj``, or ``.tsv``. If ``subsets``
        is not ``None``, when calling the ``write()`` method, path is
        assumed to be a string containing the path to the directory to
        write the feature files with an additional file extension
        specifying the file type. For example ``/foo/.csv``.
    feature_set : skll.FeatureSet
        The ``FeatureSet`` instance to dump to the file.
    quiet : bool
        Do not print "Writing..." status message to stderr.
        Defaults to ``True``.
    requires_binary : bool
        Whether or not the Writer must open the file in binary mode
        for writing with Python 2.
        Defaults to ``False``.
    subsets : dict (str to list of str)
        A mapping from subset names to lists of feature names that are
        included in those sets. If given, a feature file will be
        written for every subset (with the name containing the subset
        name as suffix to ``path``). Note, since string-valued
        features are automatically converted into boolean features
        with names of the form ``FEATURE_NAME=STRING_VALUE``, when
        doing the filtering, the portion before the ``=`` is all
        that's used for matching. Therefore, you do not need to
        enumerate all of these boolean feature names in your mapping.
        Defaults to ``None``.
    logger : logging.Logger
        A logger instance to use to log messages instead of creating
        a new one by default.
        Defaults to ``None``.
    """

    def __init__(self, path, feature_set, **kwargs):
        super(Writer, self).__init__()
        self.requires_binary = kwargs.pop('requires_binary', False)
        self.quiet = kwargs.pop('quiet', True)
        self.path = path
        self.feat_set = feature_set
        self.subsets = kwargs.pop('subsets', None)
        logger = kwargs.pop('logger', None)
        self.logger = logger if logger else logging.getLogger(__name__)
        # Get prefix & extension for checking file types & writing subset files
        # TODO: Determine if we purposefully used this instead of os.path.split
        self.root, self.ext = re.search(r'^(.*)(\.[^.]*)$', path).groups()
        self._progress_msg = ''
        if kwargs:
            raise ValueError('Passed extra keyword arguments to '
                             'Writer constructor: {}'.format(kwargs))
    @classmethod
    def for_path(cls, path, feature_set, **kwargs):
        """
        Retrieve object of ``Writer`` sub-class that is appropriate
        for given path.

        Parameters
        ----------
        path : str
            A path to the feature file we would like to create. The
            suffix to this filename must be ``.arff``, ``.csv``,
            ``.jsonlines``, ``.libsvm``, ``.megam``, ``.ndj``, or
            ``.tsv``. If ``subsets`` is not ``None``, when calling the
            ``write()`` method, path is assumed to be a string
            containing the path to the directory to write the feature
            files with an additional file extension specifying the
            file type. For example ``/foo/.csv``.
        feature_set : skll.FeatureSet
            The ``FeatureSet`` instance to dump to the output file.
        kwargs : dict
            The keyword arguments for ``for_path`` are the same as
            the initializer for the desired ``Writer`` subclass.

        Returns
        -------
        writer : skll.data.writers.Writer
            New instance of the Writer sub-class that is appropriate
            for the given path.
        """
        # Get lowercase extension for file extension checking
        ext = '.' + path.rsplit('.', 1)[-1].lower()
        return EXT_TO_WRITER[ext](path, feature_set, **kwargs)
    def write(self):
        """
        Writes out this Writer's ``FeatureSet`` to a file in its
        format.
        """
        if isinstance(self.feat_set.vectorizer, FeatureHasher):
            raise ValueError('Writer cannot write sets that use '
                             'FeatureHasher for vectorization.')

        # Write one feature file if we weren't given a dict of subsets
        if self.subsets is None:
            self._write_subset(self.path, None)
        # Otherwise write one feature file per subset
        else:
            for subset_name, filter_features in iteritems(self.subsets):
                self.logger.debug('Subset ({}) features: {}'.format(
                    subset_name, filter_features))
                sub_path = os.path.join(self.root,
                                        '{}{}'.format(subset_name, self.ext))
                self._write_subset(sub_path, set(filter_features))
    def _write_subset(self, sub_path, filter_features):
        """
        Writes out the given ``FeatureSet`` instance to a file in this
        class's format.

        Parameters
        ----------
        sub_path : str
            The path to the file we want to create for this subset of
            our data.
        filter_features : set of str
            Set of features to include in current feature file.
        """
        self.logger.debug('sub_path: %s', sub_path)
        self.logger.debug('feature_set: %s', self.feat_set.name)
        self.logger.debug('filter_features: %s', filter_features)

        if not self.quiet:
            self._progress_msg = "Writing {}...".format(sub_path)
            print(self._progress_msg, end="\r", file=sys.stderr)
            sys.stderr.flush()

        # Apply filtering
        filtered_set = (self.feat_set.filtered_iter(features=filter_features)
                        if filter_features is not None else self.feat_set)

        # Open file for writing and write each line
        file_mode = 'wb' if (self.requires_binary and PY2) else 'w'
        with open(sub_path, file_mode) as output_file:
            # Write out the header if this format requires it
            self._write_header(filtered_set, output_file, filter_features)
            # Write individual lines
            for ex_num, (id_, label_, feat_dict) in enumerate(filtered_set):
                self._write_line(id_, label_, feat_dict, output_file)
                if not self.quiet and ex_num % 100 == 0:
                    print("{}{:>15}".format(self._progress_msg, ex_num),
                          end="\r", file=sys.stderr)
                    sys.stderr.flush()
            if not self.quiet:
                print("{}{:<15}".format(self._progress_msg, "done"),
                      file=sys.stderr)
                sys.stderr.flush()

    def _write_header(self, feature_set, output_file, filter_features):
        """
        Called before lines are written to file, so that headers can
        be written for files that need them.

        Parameters
        ----------
        feature_set : skll.FeatureSet
            The ``FeatureSet`` instance being written to a file.
        output_file : file buffer
            The file being written to.
        filter_features : set of str
            If only writing a subset of the features in the FeatureSet
            to ``output_file``, these are the features to include in
            this file.
        """
        pass

    def _write_line(self, id_, label_, feat_dict, output_file):
        """
        Write the current line in the file in this Writer's format.

        Parameters
        ----------
        id_ : str
            The ID for the current instance.
        label_ : str
            The label for the current instance.
        feat_dict : dict
            The feature dictionary for the current instance.
        output_file : file buffer
            The file being written to.

        Raises
        ------
        NotImplementedError
        """
        raise NotImplementedError
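# A minimal usage sketch (not part of the original module, and the feature
# set below is hypothetical): ``Writer.for_path`` dispatches on the file
# extension, so dumping a FeatureSet ``fs`` to CSV, or writing one file per
# feature subset, might look like:
#
#     from skll.data import FeatureSet, Writer
#     fs = FeatureSet('example', ids=['ex1', 'ex2'], labels=['a', 'b'],
#                     features=[{'f1': 1.0}, {'f1': 0.0, 'f2': 2.0}])
#     Writer.for_path('/tmp/example.csv', fs).write()
#     # With subsets, the path holds the directory plus the extension,
#     # and one file (e.g. /tmp/out/just_f1.csv) is written per subset.
#     Writer.for_path('/tmp/out/.csv', fs,
#                     subsets={'just_f1': ['f1']}).write()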
class DelimitedFileWriter(Writer):

    """
    Writer for writing out FeatureSets as TSV/CSV files.

    Parameters
    ----------
    path : str
        A path to the feature file we would like to create. If
        ``subsets`` is not ``None``, this is assumed to be a string
        containing the path to the directory to write the feature
        files with an additional file extension specifying the file
        type. For example ``/foo/.csv``.
    feature_set : skll.FeatureSet
        The ``FeatureSet`` instance to dump to the output file.
    quiet : bool
        Do not print "Writing..." status message to stderr.
        Defaults to ``True``.
    label_col : str
        Name of the column which contains the class labels for
        ARFF/CSV/TSV files. If no column with that name exists, or
        ``None`` is specified, the data is considered to be
        unlabelled.
        Defaults to ``'y'``.
    id_col : str
        Name of the column which contains the instance IDs. If no
        column with that name exists, or ``None`` is specified,
        example IDs will be automatically generated.
        Defaults to ``'id'``.
    dialect : str
        The CSV dialect to use when writing out CSV/TSV files.
        Defaults to ``'excel-tab'``.
    logger : logging.Logger
        A logger instance to use to log messages instead of creating
        a new one by default.
        Defaults to ``None``.
    kwargs : dict, optional
        The arguments to the ``Writer`` object being instantiated.
    """

    def __init__(self, path, feature_set, **kwargs):
        kwargs['requires_binary'] = True
        self.dialect = kwargs.pop('dialect', 'excel-tab')
        self.label_col = kwargs.pop('label_col', 'y')
        self.id_col = kwargs.pop('id_col', 'id')
        super(DelimitedFileWriter, self).__init__(path, feature_set, **kwargs)
        self._dict_writer = None

    def _get_fieldnames(self, filter_features):
        """
        Build list of field names for DictWriter from self.feat_set.

        Parameters
        ----------
        filter_features : set of str
            Set of features to include in current feature file.

        Returns
        -------
        fieldnames : list of str
            A list of alphabetically sorted field names.
        """
        # Build list of fieldnames (features + id_col + label_col)
        if filter_features is not None:
            fieldnames = {feat_name for feat_name in
                          self.feat_set.vectorizer.get_feature_names()
                          if (feat_name in filter_features or
                              feat_name.split('=', 1)[0] in filter_features)}
        else:
            fieldnames = set(self.feat_set.vectorizer.get_feature_names())
        fieldnames.add(self.id_col)
        if self.feat_set.has_labels:
            fieldnames.add(self.label_col)
        return sorted(fieldnames)

    def _write_header(self, feature_set, output_file, filter_features):
        """
        Called before lines are written to file, so that headers can
        be written for files that need them.

        Parameters
        ----------
        feature_set : skll.FeatureSet
            The ``FeatureSet`` instance being written to a file.
        output_file : file buffer
            The file being written to.
        filter_features : set of str
            If only writing a subset of the features in the
            ``FeatureSet`` instance to ``output_file``, these are the
            features to include in this file.
        """
        # Initialize DictWriter that will be used to write header and rows
        self._dict_writer = DictWriter(output_file,
                                       self._get_fieldnames(filter_features),
                                       restval=0, dialect=self.dialect)
        # Actually output the header to the file
        self._dict_writer.writeheader()

    def _write_line(self, id_, label_, feat_dict, output_file):
        """
        Write the current line in the file in this Writer's format.

        Parameters
        ----------
        id_ : str
            The ID for the current instance.
        label_ : str
            The label for the current instance.
        feat_dict : dict
            The feature dictionary for the current instance.
        output_file : file buffer
            The file being written to.

        Raises
        ------
        ValueError
            If the class column name is already used as a feature name.
        ValueError
            If the ID column name is already used as a feature name.
        """
        # Add class column to feat_dict (unless this is unlabelled data)
        if self.label_col not in feat_dict:
            if self.feat_set.has_labels:
                feat_dict[self.label_col] = label_
        else:
            raise ValueError(('Class column name "{}" already used as '
                              'feature name.').format(self.label_col))

        # Add id column to feat_dict if id is provided
        if self.id_col not in feat_dict:
            feat_dict[self.id_col] = id_
        else:
            raise ValueError('ID column name "{}" already used as feature '
                             'name.'.format(self.id_col))

        # Write out line
        self._dict_writer.writerow(feat_dict)
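# Illustration (an inference from the code above, using the hypothetical
# two-instance FeatureSet from the earlier sketch, not verbatim module
# output): ``_get_fieldnames`` sorts the features plus the id/label columns,
# and ``restval=0`` fills in zeros for features absent from an instance, so
# a CSV dump would resemble:
#
#     f1,f2,id,y
#     1.0,0,ex1,a
#     0.0,2.0,ex2,b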
class CSVWriter(DelimitedFileWriter):

    """
    Writer for writing out ``FeatureSet`` instances as CSV files.

    Parameters
    ----------
    path : str
        A path to the feature file we would like to create. If
        ``subsets`` is not ``None``, this is assumed to be a string
        containing the path to the directory to write the feature
        files with an additional file extension specifying the file
        type. For example ``/foo/.csv``.
    feature_set : skll.FeatureSet
        The ``FeatureSet`` instance to dump to the output file.
    kwargs : dict, optional
        The arguments to the ``Writer`` object being instantiated.
    """

    def __init__(self, path, feature_set, **kwargs):
        kwargs['dialect'] = 'excel'
        super(CSVWriter, self).__init__(path, feature_set, **kwargs)
        self._dict_writer = None
class TSVWriter(DelimitedFileWriter):

    """
    Writer for writing out FeatureSets as TSV files.

    Parameters
    ----------
    path : str
        A path to the feature file we would like to create. If
        ``subsets`` is not ``None``, this is assumed to be a string
        containing the path to the directory to write the feature
        files with an additional file extension specifying the file
        type. For example ``/foo/.tsv``.
    feature_set : skll.FeatureSet
        The ``FeatureSet`` instance to dump to the output file.
    kwargs : dict, optional
        The arguments to the ``Writer`` object being instantiated.
    """

    def __init__(self, path, feature_set, **kwargs):
        kwargs['dialect'] = 'excel-tab'
        super(TSVWriter, self).__init__(path, feature_set, **kwargs)
        self._dict_writer = None
class ARFFWriter(DelimitedFileWriter):

    """
    Writer for writing out FeatureSets as ARFF files.

    Parameters
    ----------
    path : str
        A path to the feature file we would like to create. If
        ``subsets`` is not ``None``, this is assumed to be a string
        containing the path to the directory to write the feature
        files with an additional file extension specifying the file
        type. For example ``/foo/.arff``.
    feature_set : skll.FeatureSet
        The ``FeatureSet`` instance to dump to the output file.
    relation : str, optional
        The name of the relation in the ARFF file.
        Defaults to ``'skll_relation'``.
    regression : bool, optional
        Is this an ARFF file to be used for regression?
        Defaults to ``False``.
    kwargs : dict, optional
        The arguments to the ``Writer`` object being instantiated.
    """

    def __init__(self, path, feature_set, **kwargs):
        self.relation = kwargs.pop('relation', 'skll_relation')
        self.regression = kwargs.pop('regression', False)
        kwargs['dialect'] = 'arff'
        super(ARFFWriter, self).__init__(path, feature_set, **kwargs)
        self._dict_writer = None

    def _write_header(self, feature_set, output_file, filter_features):
        """
        Called before lines are written to file, so that headers can
        be written for files that need them.

        Parameters
        ----------
        feature_set : skll.FeatureSet
            The FeatureSet being written to a file.
        output_file : file buffer
            The file being written to.
        filter_features : set of str
            If only writing a subset of the features in the FeatureSet
            to ``output_file``, these are the features to include in
            this file.
        """
        fieldnames = self._get_fieldnames(filter_features)
        if self.label_col in fieldnames:
            fieldnames.remove(self.label_col)

        # Add relation to header
        print("@relation '{}'\n".format(self.relation), file=output_file)

        # Loop through fields writing the header info for the ARFF file
        for field in fieldnames:
            print("@attribute '{}' numeric".format(
                field.replace('\\', '\\\\').replace("'", "\\'")),
                file=output_file)

        # Print class label header if necessary
        if self.regression:
            print("@attribute {} numeric".format(self.label_col),
                  file=output_file)
        else:
            if self.feat_set.has_labels:
                print("@attribute {} ".format(self.label_col) + "{" +
                      ','.join(map(str, sorted(set(self.feat_set.labels)))) +
                      "}", file=output_file)
        fieldnames.append(self.label_col)

        # Create CSV writer to handle missing values for lines in data section
        # and to ignore the instance values for non-numeric attributes
        self._dict_writer = DictWriter(output_file, fieldnames, restval=0,
                                       extrasaction='ignore', dialect='arff')

        # Finish header and start data section
        print("\n@data", file=output_file)
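# Sketch of the ARFF header ``_write_header`` above would emit for the same
# hypothetical two-feature, two-label FeatureSet (an inference from the code,
# not captured output; note that the id column is declared numeric along with
# the features):
#
#     @relation 'skll_relation'
#
#     @attribute 'f1' numeric
#     @attribute 'f2' numeric
#     @attribute 'id' numeric
#     @attribute y {a,b}
#
#     @data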
class MegaMWriter(Writer):

    """ Writer for writing out FeatureSets as MegaM files. """

    @staticmethod
    def _replace_non_ascii(line):
        """
        Replace all non-ASCII characters in a line with ``<U####>``
        escape sequences.

        Parameters
        ----------
        line : str
            The line to clean up.

        Returns
        -------
        char_list : str
            Copy of line with all non-ASCII characters replaced with
            <U1234> sequences where 1234 is the value of ord() for the
            character.
        """
        char_list = []
        for char in line:
            char_num = ord(char)
            char_list.append('<U{}>'.format(char_num) if char_num > 127
                             else char)
        return ''.join(char_list)

    def _write_line(self, id_, label_, feat_dict, output_file):
        """
        Write the current line in the file in MegaM format.

        Parameters
        ----------
        id_ : str
            The ID for the current instance.
        label_ : str
            The label for the current instance.
        feat_dict : dict
            The feature dictionary for the current instance.
        output_file : file buffer
            The file being written to.
        """
        # Don't try to add class column if this is label-less data
        print('# {}'.format(id_), file=output_file)
        if self.feat_set.has_labels:
            print('{}'.format(label_), end='\t', file=output_file)
        print(self._replace_non_ascii(' '.join(('{} {}'.format(field, value)
                                                for field, value in
                                                sorted(feat_dict.items()) if
                                                Decimal(value) != 0))),
              file=output_file)
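# Sketch of the MegaM output ``_write_line`` would produce for a hypothetical
# labelled instance (an inference from the code, with <TAB> standing in for
# the tab character): a comment line with the ID, then the label, a tab, and
# the space-separated nonzero feature/value pairs:
#
#     # ex1
#     a<TAB>f1 1.0 f2 2.0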
class NDJWriter(Writer):

    """
    Writer for writing out FeatureSets as .jsonlines/.ndj files.

    Parameters
    ----------
    path : str
        A path to the feature file we would like to create. If
        ``subsets`` is not ``None``, this is assumed to be a string
        containing the path to the directory to write the feature
        files with an additional file extension specifying the file
        type. For example ``/foo/.ndj``.
    feature_set : skll.FeatureSet
        The ``FeatureSet`` instance to dump to the output file.
    kwargs : dict, optional
        The arguments to the ``Writer`` object being instantiated.
    """

    def __init__(self, path, feature_set, **kwargs):
        kwargs['requires_binary'] = True
        super(NDJWriter, self).__init__(path, feature_set, **kwargs)

    def _write_line(self, id_, label_, feat_dict, output_file):
        """
        Write the current line in the file in NDJ format.

        Parameters
        ----------
        id_ : str
            The ID for the current instance.
        label_ : str
            The label for the current instance.
        feat_dict : dict
            The feature dictionary for the current instance.
        output_file : file buffer
            The file being written to.
        """
        example_dict = {}
        # Don't try to add class column if this is label-less data
        if self.feat_set.has_labels:
            example_dict['y'] = np.asscalar(label_)
        example_dict['id'] = np.asscalar(id_)
        example_dict["x"] = feat_dict
        print(json.dumps(example_dict, sort_keys=True), file=output_file)
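# Each .jsonlines/.ndj line serializes one instance as a JSON object with
# sorted keys, so for a hypothetical labelled instance the output line would
# resemble:
#
#     {"id": "ex1", "x": {"f1": 1.0, "f2": 2.0}, "y": "a"}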
class LibSVMWriter(Writer):

    """
    Writer for writing out FeatureSets as LibSVM/SVMLight files.

    Parameters
    ----------
    path : str
        A path to the feature file we would like to create. If
        ``subsets`` is not ``None``, this is assumed to be a string
        containing the path to the directory to write the feature
        files with an additional file extension specifying the file
        type. For example ``/foo/.libsvm``.
    feature_set : skll.FeatureSet
        The ``FeatureSet`` instance to dump to the output file.
    kwargs : dict, optional
        The arguments to the ``Writer`` object being instantiated.
    """

    LIBSVM_REPLACE_DICT = {':': '\u2236',
                           '#': '\uFF03',
                           ' ': '\u2002',
                           '=': '\ua78a',
                           '|': '\u2223'}

    def __init__(self, path, feature_set, **kwargs):
        self.label_map = kwargs.pop('label_map', None)
        super(LibSVMWriter, self).__init__(path, feature_set, **kwargs)
        if self.label_map is None:
            self.label_map = {}
            if feature_set.has_labels:
                self.label_map = {label: num for num, label in
                                  enumerate(sorted({label for label in
                                                    feature_set.labels if
                                                    not isinstance(label,
                                                                   (int,
                                                                    float))}))}
            # Add fake entry to the label map for None
            self.label_map[None] = '00000'

    @staticmethod
    def _sanitize(name):
        """
        Replace illegal characters in class names with close unicode
        equivalents to make the files loadable by LibSVM, LibLinear,
        or SVMLight.

        Parameters
        ----------
        name : str
            The class name in which to replace illegal characters.

        Returns
        -------
        name : str
            The class name with unicode equivalent replacements.
        """
        if isinstance(name, string_types):
            for orig, replacement in LibSVMWriter.LIBSVM_REPLACE_DICT.items():
                name = name.replace(orig, replacement)
        return name

    def _write_line(self, id_, label_, feat_dict, output_file):
        """
        Write the current line in the file in LibSVM format.

        Parameters
        ----------
        id_ : str
            The ID for the current instance.
        label_ : str
            The label for the current instance.
        feat_dict : dict
            The feature dictionary for the current instance.
        output_file : file buffer
            The file being written to.
        """
        field_values = sorted([(self.feat_set.vectorizer.vocabulary_[field] +
                                1, value) for field, value in
                               iteritems(feat_dict) if Decimal(value) != 0])
        # Print label
        if label_ in self.label_map:
            print('{}'.format(self.label_map[label_]), end=' ',
                  file=output_file)
        else:
            print('{}'.format(label_), end=' ', file=output_file)
        # Print features
        print(' '.join(('{}:{}'.format(field, value) for field, value in
                        field_values)), end=' ', file=output_file)
        # Print comment with id and mappings
        print('#', end=' ', file=output_file)
        print(self._sanitize('{}'.format(id_)), end='', file=output_file)
        print(' |', end=' ', file=output_file)
        if (PY2 and self.feat_set.has_labels and
                isinstance(label_, text_type)):
            label_ = label_.encode('utf-8')
        if label_ in self.label_map:
            print('%s=%s' % (self._sanitize(self.label_map[label_]),
                             self._sanitize(label_)),
                  end=' | ', file=output_file)
        else:
            print(' |', end=' ', file=output_file)
        line = ' '.join(('%s=%s' %
                         (self.feat_set.vectorizer.vocabulary_[field] + 1,
                          self._sanitize(field))
                         for field, value in feat_dict.items()
                         if Decimal(value) != 0))
        print(line, file=output_file)
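# Sketch of one LibSVM line from ``_write_line`` (an inference from the code
# for a hypothetical instance, assuming the vectorizer assigned 'f1' index 0
# and 'f2' index 1, so the 1-based indices are 1 and 2, and the label map
# sent 'a' to 0): the trailing comment records the ID and the label/feature
# mappings used:
#
#     0 1:1.0 2:2.0 # ex1 | 0=a | 1=f1 2=f2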
# Constants
EXT_TO_WRITER = {".arff": ARFFWriter,
                 ".csv": CSVWriter,
                 ".jsonlines": NDJWriter,
                 ".libsvm": LibSVMWriter,
                 ".megam": MegaMWriter,
                 ".ndj": NDJWriter,
                 ".tsv": TSVWriter}