Source code for recipipe.core


import collections
import inspect

import sklearn.pipeline
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin

from recipipe.utils import add_to_map_dict
from recipipe.utils import flatten_list
from recipipe.utils import fit_columns


class Recipipe(sklearn.pipeline.Pipeline):
    """Recipipe pipeline.

    A Recipipe pipeline is an extension of an SKLearn pipeline.
    It adds some functionality that makes the creation of pipelines less
    painful.
    For example, the `steps` param is not required at construction time.
    You can add your transformers to the pipeline at any time using
    :obj:`recipipe.core.Recipipe.add`.

    Attributes:
        Same attributes as :obj:`sklearn.pipeline.Pipeline`.
    """

    def __init__(self, steps=None, **kwargs):
        """Create a Recipipe pipeline.

        Args:
            steps (:obj:`list`): Same as in :obj:`sklearn.pipeline.Pipeline`.
            kwargs: Same as in :obj:`sklearn.pipeline.Pipeline`:
                `memory` and `verbose`.
        """

        self.steps = []
        if steps:
            for i in steps:
                self.add(i)
        else:
            # Mock the step validation to avoid the empty-list validation.
            # This depends a lot on the internal representation of the
            # Pipeline class, but it's better than using a dummy empty
            # transformer.
            # An empty transformer adds more complexity to make methods like
            # __len__ work properly. Also, it's impossible to control the use
            # of self.steps from outside of the class.
            aux = self._validate_steps
            self._validate_steps = lambda: True
        super().__init__(self.steps, **kwargs)
        # Unmock the validation method if needed.
        if not steps:
            self._validate_steps = aux

    def __add__(self, transformer):
        """Add a new step to the pipeline using the '+' operator.

        Note that this is exactly the same as calling
        :obj:`recipipe.core.Recipipe.add`.
        The Recipipe object is going to be modified, that is, `p = p + t` is
        the same as `p + t`, where `p` is any Recipipe pipeline and `t` is
        any transformer.

        See Also:
            :obj:`recipipe.core.Recipipe.add`
        """

        return self.add(transformer)

    def add(self, step):
        """Add a new step to the pipeline.

        You can add steps even if the pipeline is already fitted, so be
        careful.

        Args:
            step (Transformer or tuple(`str`, Transformer)): The new step
                that you want to add to the pipeline.
                Any transformer is good (SKLearn transformer or
                :obj:`recipipe.core.RecipipeTransformer`).
                If a tuple is given, the first element of the tuple is used
                as the name of the step in the pipeline.

        Returns:
            The pipeline.
            You can chain `add` calls: `pipe.add(...).add(...)...`.

        See Also:
            :obj:`recipipe.core.Recipipe.__add__`
        """

        if type(step) is not tuple:
            transformer = step
            if getattr(transformer, "name", None):
                name = transformer.name
            else:
                idx = len(self.steps)
                name = f"step{idx:02d}"
            step = (name, transformer)
        self.steps.append(step)
        return self
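

# Illustrative usage sketch (not part of the original module): a Recipipe
# pipeline can be built incrementally with :obj:`Recipipe.add` or the `+`
# operator and then fitted like any other SKLearn pipeline. Any SKLearn
# transformer works as a step; the DataFrame and step name below are made up
# for the example.
def _example_recipipe_usage():  # pragma: no cover
    import pandas as pd
    from sklearn.preprocessing import StandardScaler

    df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [10.0, 20.0, 30.0]})
    pipe = Recipipe()                      # No steps needed at construction.
    pipe.add(("scale", StandardScaler()))  # Named step.
    pipe = pipe + StandardScaler()         # Same as pipe.add(StandardScaler()).
    return pipe.fit_transform(df)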
""" The next lines are really nasty, but they avoid an error with: from sklearn import set_config set_config(display='diagram') <any-recipipe-transformer-here> executed in a notebook. The reason to overwrite an SKLearn function is because we have params that do not need to be used as __init__ params. That function avoids showing all the params of an estimator, only shows those params with a non-default value. Overwriting it will print extra information, but nothing bad (I suppose :). """ import sklearn.utils._pprint sklearn.utils._pprint._changed_params = lambda e: e.get_params(deep=False)


class RecipipeTransformer(BaseEstimator, TransformerMixin):
    """Base class of all Recipipe transformers.

    Attributes:
        name (str): Human-friendly name of the transformer.
    """

    @classmethod
    def _get_param_names(cls):
        """Get parameter names for the estimator.

        Taken from SKLearn with some extra modifications to allow the use of
        var positional arguments (*args) in estimators.
        My modifications are indicated by comments with two ##.
        """

        parent_params = []
        if cls.__name__ != RecipipeTransformer.__name__:
            for i in cls.__bases__:
                parent_params += i._get_param_names()
        # fetch the constructor or the original constructor before
        # deprecation wrapping if any
        init = getattr(cls.__init__, 'deprecated_original', cls.__init__)
        if init is object.__init__:  # pragma: no cover
            ## I don't know how to force this situation for testing...
            # No explicit constructor to introspect
            return []
        # introspect the constructor arguments to find the model parameters
        # to represent
        init_signature = inspect.signature(init)
        # Consider the constructor parameters excluding 'self'
        ## Taking p.name instead of p.
        parameters = [p.name for p in init_signature.parameters.values()
                      if p.name != 'self' and p.kind != p.VAR_KEYWORD
                      ## No positional vars (*args).
                      and p.kind != p.VAR_POSITIONAL]
        ## No RuntimeError raised here!
        ## Adding super params.
        parameters += parent_params
        parameters = set(parameters)
        # Extract and sort argument names excluding 'self'
        return sorted(parameters)

    def __init__(self, *args, cols_init=None, exclude=None, dtype=None,
                 name=None, keep_original=False, col_format="{}",
                 cols_not_found_error=False):
        """Create a new transformer.

        Column names can use Unix filename pattern matching
        (:obj:`fnmatch`).

        Args:
            *args (:obj:`list` of :obj:`str`): List of columns the
                transformer will work on.
            cols_init (:obj:`list` of :obj:`str`): List of columns the
                transformer will work on.
                If `*args` are provided, this list of columns is appended at
                the end.
            exclude (:obj:`list` of :obj:`str`): List of columns to exclude.
                The exclusion is applied after fitting the columns, so it can
                be used at the same time as `*args` and `cols_init`.
            dtype (:obj:`numpy.dtype`, :obj:`str`, :obj:`list` of
                :obj:`numpy.dtype` or :obj:`str`, or :obj:`dict`): This value
                is passed to :obj:`pandas.DataFrame.select_dtypes`.
                If a :obj:`dict` is given, the Pandas function is called with
                dictionary unpacking: `select_dtypes(**dtype)`.
                In this way you can exclude, for example, int dtypes using:
                `dtype=dict(exclude=int)`.
                The columns returned by this method (executed on the
                DataFrame passed to the fit method) will be the columns used
                in the transformation phase.
                When used in combination with `*args` or `cols_init`, the
                dtype filter is applied later.
            name (:obj:`str`): Human-friendly name of the transformer.
            keep_original (:obj:`bool`): `True` if you want to keep the input
                columns used in the transformer in the transformed DataFrame,
                `False` if not.
                Note that, if the output column has the same name as the
                input column, the original column will not be included even
                if `keep_original` is set to `True`.
                Default: `False`.
            col_format (:obj:`str`): New name of the columns.
                Use "{}" as a placeholder for the input column name.
                For example, if you want to append the string "_new" at the
                end of all the generated columns you must set
                `col_format="{}_new"`.
                Default: "{}".
            cols_not_found_error (:obj:`bool`): Raise an error if there isn't
                any match for any of the specified columns.
                Default: `False`.
        """

        if cols_init:
            args = (args, cols_init)
        cols_init = flatten_list(args)

        # Set values.
        self.cols_init = cols_init
        self.exclude = flatten_list(exclude or [])
        self.dtype = dtype
        self.keep_original = keep_original
        self.name = name
        self.col_format = col_format
        self.col_map = None  # Set in fit.
        self.cols = None  # Fitted columns, set in fit.
        self.cols_out = None  # Set in fit.
        self.cols_not_found_error = cols_not_found_error
        self.cols_in_out = None  # Set in fit.

    def _fit(self, df):  # pragma: no cover
        """Your fit code should be here.

        Args:
            df (:obj:`pandas.DataFrame`): DataFrame used for fitting.
        """

        pass

    def _transform(self, df):  # pragma: no cover
        """Your transform code should be here.

        Abstract method that you should overwrite in your classes.
        Remember that any transformation done here should be consistent with
        the column mapping returned by
        :obj:`recipipe.core.RecipipeTransformer.get_column_mapping`.
        Ex: if your column mapping is `{"c1": "c1"}`, do not return a
        DataFrame with columns `c1` and `c2` from this method.

        Args:
            df (:obj:`pandas.DataFrame`): DataFrame to transform.

        Return:
            The transformed DataFrame.
        """

        return df

    def _inverse_transform(self, df):  # pragma: no cover
        return df

    def fit(self, df, y=None):
        """Fit the transformer.

        Args:
            df (:obj:`pandas.DataFrame`): DataFrame used to fit the
                transformation.
        """

        cols = fit_columns(df, self.cols_init, self.dtype,
                           self.cols_not_found_error)
        exclude = (fit_columns(df, self.exclude, None,
                               self.cols_not_found_error)
                   if self.exclude else [])
        self.cols = [i for i in cols if i not in exclude]
        self._fit(df)
        # Save column maps and lists.
        self.col_map = self.get_column_mapping()
        # Recreate cols, just in case you overwrite get_column_mapping
        # but you didn't specify any cols_init.
        self.cols = list(collections.OrderedDict.fromkeys(
            flatten_list(self.col_map.keys())))
        col_map_1_n, col_map_1_n_inverse = {}, {}
        for k, v in self.col_map.items():
            add_to_map_dict(col_map_1_n, k, v)
            add_to_map_dict(col_map_1_n_inverse, v, k)
        self.col_map_1_n = col_map_1_n
        self.col_map_1_n_inverse = col_map_1_n_inverse
        self.cols_out = list(collections.OrderedDict.fromkeys(
            flatten_list(self.col_map.values())))
        # Columns present in both the input and output columns should be
        # removed from df_in during the transform phase.
        # We join df_in with df_out, so we do not want duplicate column names.
        self.cols_in_out = set(self.cols).intersection(set(self.cols_out))
        if self.keep_original and self.cols_in_out:
            raise ValueError("Rename the output columns if you want to keep "
                             "the original columns, name collisions in "
                             f"{self.cols_in_out}")
        return self

    def transform(self, df_in):
        """Transform a DataFrame.

        Args:
            df_in (:obj:`pandas.DataFrame`): Input DataFrame.

        Returns:
            Transformed DataFrame.

        Raise:
            :obj:`ValueError` if `cols` are not fitted.
            Fit the transformer to avoid this error.
        """

        if self.cols is None:
            raise ValueError("No cols set. Transformer not fitted?")
        in_cols = df_in.columns
        df_out = self._transform(df_in)
        df_in = df_in.drop(self.cols_in_out, axis=1)
        # Join input columns to output columns.
        df_joined = df_in.join(df_out)
        # Reorder output columns and return.
        # This cannot be precomputed during fit because we want to support
        # any extra column not present at fit time. We also want to
        # maintain the input column order.
        # Ex: we can fit a df that contains a target column and transform
        # dfs without that target.
        cols_out = self._get_ordered_out_cols(in_cols, self.cols,
                                              self.col_map_1_n,
                                              self.keep_original)
        return df_joined[cols_out]

    def _get_ordered_out_cols(self, cols_in_all, cols_in, col_map_1_n,
                              keep_original=False):
        cols_out = []
        for i in cols_in_all:
            if i in cols_in:
                if keep_original:
                    cols_out.append(i)
                cols_out += col_map_1_n[i]
            else:
                cols_out.append(i)
        # Remove duplicates.
        cols_out = list(collections.OrderedDict.fromkeys(cols_out))
        return cols_out

    def inverse_transform(self, df_in):
        in_cols = df_in.columns
        if self.keep_original and all(i in in_cols for i in self.cols):
            # If we keep the original columns and all of them are present in
            # df_in, we save computation by removing the output columns and
            # returning the original columns.
            df_in = df_in.drop(self.cols_out, axis=1, errors="ignore")
            return df_in
        df_out = self._inverse_transform(df_in)
        df_in = df_in.drop(self.cols_in_out, axis=1)
        if self.keep_original:
            # If we keep the original cols, the best option would be to just
            # drop the transformed columns (look at the first if of this
            # method), but I want it to work even without those columns.
            df_in = df_in.drop(df_out.columns, axis=1, errors="ignore")
        df_joined = df_in.join(df_out)
        cols_out = self._get_ordered_out_cols(in_cols, self.cols_out,
                                              self.col_map_1_n_inverse)
        return df_joined[cols_out]

    def get_column_mapping(self):
        """Get the column mapping between the input and transformed DataFrame.

        By default it returns a 1:1 map between the input and output columns.
        Make sure your transformer is fitted before calling this function.

        Return:
            A dict in which the keys are the input DataFrame column names and
            the values are the output DataFrame column names.
            Both keys and values can be tuples: a tuple key is useful to
            indicate that one output column has been created from a list of
            columns of the input DataFrame, and a tuple value is useful to
            indicate that a list of output columns comes from one specific
            column of the input DataFrame.
            We use tuples and not lists because lists are not hashable, so
            they cannot be keys in a dict.

        See Also:
            :obj:`recipipe.core.RecipipeTransformer.col_format`

        Raise:
            :obj:`ValueError` if `self.cols` is `None`.
        """

        if self.cols is None:
            raise ValueError("No columns. Transformer not fitted?")
        return {i: self.col_format.format(i) for i in self.cols}
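

# Illustrative sketch (not part of the original module): a minimal transformer
# built on top of RecipipeTransformer. It only overrides `_fit` and
# `_transform`; column selection (by name, fnmatch pattern or dtype), renaming
# through `col_format` and the input/output column bookkeeping are all
# inherited from the base class. The class name is made up for the example.
class _ExampleMinMaxScaler(RecipipeTransformer):  # pragma: no cover

    def _fit(self, df):
        # self.cols is already fitted at this point; store per-column
        # min and max for the fitted columns only.
        self.min_ = df[self.cols].min()
        self.max_ = df[self.cols].max()

    def _transform(self, df):
        # Return a DataFrame consistent with the (default 1:1) column
        # mapping: one output column per fitted input column, renamed
        # according to col_format.
        out = (df[self.cols] - self.min_) / (self.max_ - self.min_)
        out.columns = self.cols_out
        return out

# Example usage of the sketch above, e.g. scaling all numeric columns while
# keeping the originals (assumes `df` is a pandas DataFrame):
#
#     scaler = _ExampleMinMaxScaler(dtype="number", col_format="{}_scaled",
#                                   keep_original=True)
#     df_out = scaler.fit(df).transform(df)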