Source code for recipipe.core


import collections
import inspect

import sklearn.pipeline
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin

from recipipe.utils import add_to_map_dict
from recipipe.utils import flatten_list
from recipipe.utils import fit_columns


class Recipipe(sklearn.pipeline.Pipeline):
    """Recipipe pipeline.

    A Recipipe pipeline is an extension of an SKLearn pipeline.
    It adds some functionality that makes the creation of pipelines less
    painful.
    For example, the `steps` param is not required at construction time.
    You can add your transformers to the pipeline at any time using
    :obj:`recipipe.core.Recipipe.add`.

    Attributes:
        Same attributes as :obj:`sklearn.pipeline.Pipeline`.
    """

    def __init__(self, steps=None, **kwargs):
        """Create a Recipipe pipeline.

        Args:
            steps (:obj:`list`): Same as in :obj:`sklearn.pipeline.Pipeline`.
            kwargs: Same as in :obj:`sklearn.pipeline.Pipeline`:
                `memory` and `verbose`.
        """

        self.steps = []
        if steps:
            for i in steps:
                self.add(i)
        else:
            # Mock the step validation to avoid the empty-list validation.
            # This depends a lot on the internal representation of the
            # Pipeline class, but it's better than using a dummy empty
            # transformer.
            # An empty transformer adds more complexity to make methods like
            # __len__ work properly. Also, it's impossible to control the use
            # of self.steps from outside of the class.
            aux = self._validate_steps
            self._validate_steps = lambda: True
        super().__init__(self.steps, **kwargs)
        # Unmock the validation method if needed.
        if not steps:
            self._validate_steps = aux

    def __add__(self, transformer):
        """Add a new step to the pipeline using the '+' operator.

        Note that this is exactly the same as calling
        :obj:`recipipe.core.Recipipe.add`.
        The Recipipe object is going to be modified, that is, `p = p + t` is
        the same as `p + t`, where `p` is any Recipipe pipeline and `t` is
        any transformer.

        See Also:
            :obj:`recipipe.core.Recipipe.add`
        """

        return self.add(transformer)

    def add(self, step):
        """Add a new step to the pipeline.

        You can add steps even if the pipeline is already fitted, so be
        careful.

        Args:
            step (Transformer or tuple(`str`, Transformer)): The new step
                that you want to add to the pipeline.
                Any transformer is good (SKLearn transformer or
                :obj:`recipipe.core.RecipipeTransformer`).
                If a tuple is given, the first element of the tuple is used
                as the name of the step in the pipeline.

        Returns:
            The pipeline.
            You can chain `add` calls: `pipe.add(...).add(...)...`.

        See Also:
            :obj:`recipipe.core.Recipipe.__add__`
        """

        if type(step) is not tuple:
            transformer = step
            if getattr(transformer, "name", None):
                name = transformer.name
            else:
                idx = len(self.steps)
                name = f"step{idx:02d}"
            step = (name, transformer)
        self.steps.append(step)
        return self
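

# Illustrative usage sketch (not part of the original module): a Recipipe
# pipeline can be built incrementally with :obj:`Recipipe.add` or the `+`
# operator and then fitted like any other SKLearn pipeline. Any SKLearn
# transformer works as a step; the DataFrame and step name below are made up
# for the example.
def _example_recipipe_usage():  # pragma: no cover
    import pandas as pd
    from sklearn.preprocessing import StandardScaler

    df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [10.0, 20.0, 30.0]})
    pipe = Recipipe()                      # No steps needed at construction.
    pipe.add(("scale", StandardScaler()))  # Named step.
    pipe = pipe + StandardScaler()         # Same as pipe.add(StandardScaler()).
    return pipe.fit_transform(df)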
""" The next lines are really nasty, but they avoid an error with: from sklearn import set_config set_config(display='diagram') <any-recipipe-transformer-here> executed in a notebook. The reason to overwrite an SKLearn function is because we have params that do not need to be used as __init__ params. That function avoids showing all the params of an estimator, only shows those params with a non-default value. Overwriting it will print extra information, but nothing bad (I suppose :). """ import sklearn.utils._pprint sklearn.utils._pprint._changed_params = lambda e: e.get_params(deep=False)


class RecipipeTransformer(BaseEstimator, TransformerMixin):
    """Base class of all Recipipe transformers.

    Attributes:
        name (str): Human-friendly name of the transformer.
    """

    @classmethod
    def _get_param_names(cls):
        """Get parameter names for the estimator.

        Taken from SKLearn with some extra modifications to allow the use of
        var positional arguments (*args) in estimators.
        My modifications are indicated by comments with two ##.
        """

        parent_params = []
        if cls.__name__ != RecipipeTransformer.__name__:
            for i in cls.__bases__:
                parent_params += i._get_param_names()
        # fetch the constructor or the original constructor before
        # deprecation wrapping if any
        init = getattr(cls.__init__, 'deprecated_original', cls.__init__)
        if init is object.__init__:  # pragma: no cover
            ## I don't know how to force this situation for testing...
            # No explicit constructor to introspect
            return []
        # introspect the constructor arguments to find the model parameters
        # to represent
        init_signature = inspect.signature(init)
        # Consider the constructor parameters excluding 'self'
        ## Taking p.name instead of p.
        parameters = [p.name for p in init_signature.parameters.values()
                      if p.name != 'self' and p.kind != p.VAR_KEYWORD
                      ## No positional vars (*args).
                      and p.kind != p.VAR_POSITIONAL]
        ## No RuntimeError raised here!
        ## Adding super params.
        parameters += parent_params
        parameters = set(parameters)
        # Extract and sort argument names excluding 'self'
        return sorted(parameters)

    def __init__(self, *args, cols_init=None, exclude=None, dtype=None,
                 name=None, keep_original=False, col_format="{}",
                 cols_not_found_error=False):
        """Create a new transformer.

        Column names can use Unix filename pattern matching
        (:obj:`fnmatch`).

        Args:
            *args (:obj:`list` of :obj:`str`): List of columns the
                transformer will work on.
            cols_init (:obj:`list` of :obj:`str`): List of columns the
                transformer will work on.
                If `*args` are provided, this list of columns is appended at
                the end.
            exclude (:obj:`list` of :obj:`str`): List of columns to exclude.
                The exclusion is applied after fitting the columns, so it can
                be used at the same time as `*args` and `cols_init`.
            dtype (:obj:`numpy.dtype`, :obj:`str`, :obj:`list` of
                :obj:`numpy.dtype` or :obj:`str`, or :obj:`dict`): This value
                is passed to :obj:`pandas.DataFrame.select_dtypes`.
                If a :obj:`dict` is given, the Pandas function is called with
                dictionary unpacking: `select_dtypes(**dtype)`.
                In this way you can exclude, for example, int dtypes using:
                `dtype=dict(exclude=int)`.
                The columns returned by this method (executed on the
                DataFrame passed to the fit method) will be the columns used
                in the transformation phase.
                When used in combination with `*args` or `cols_init`, the
                dtype filter is applied later.
            name (:obj:`str`): Human-friendly name of the transformer.
            keep_original (:obj:`bool`): `True` if you want to keep the input
                columns used in the transformer in the transformed DataFrame,
                `False` if not.
                Note that, if the output column has the same name as the
                input column, the original column will not be included even
                if `keep_original` is set to `True`.
                Default: `False`.
            col_format (:obj:`str`): New name of the columns.
                Use "{}" as a placeholder for the input column name.
                For example, if you want to append the string "_new" at the
                end of all the generated columns you must set
                `col_format="{}_new"`.
                Default: "{}".
            cols_not_found_error (:obj:`bool`): Raise an error if there isn't
                any match for any of the specified columns.
                Default: `False`.
        """

        if cols_init:
            args = (args, cols_init)
        cols_init = flatten_list(args)

        # Set values.
        self.cols_init = cols_init
        self.exclude = flatten_list(exclude or [])
        self.dtype = dtype
        self.keep_original = keep_original
        self.name = name
        self.col_format = col_format
        self.col_map = None  # Set in fit.
        self.cols = None  # Fitted columns, set in fit.
        self.cols_out = None  # Set in fit.
        self.cols_not_found_error = cols_not_found_error
        self.cols_in_out = None  # Set in fit.

    def _fit(self, df):  # pragma: no cover
        """Your fit code should be here.

        Args:
            df (:obj:`pandas.DataFrame`): DataFrame used for fitting.
        """

        pass

    def _transform(self, df):  # pragma: no cover
        """Your transform code should be here.

        Abstract method that you should overwrite in your classes.
        Remember that any transformation done here should be consistent with
        the column mapping returned by
        :obj:`recipipe.core.RecipipeTransformer.get_column_mapping`.
        Ex: if your column mapping is `{"c1": "c1"}`, do not return a
        DataFrame with columns `c1` and `c2` from this method.

        Args:
            df (:obj:`pandas.DataFrame`): DataFrame to transform.

        Return:
            The transformed DataFrame.
        """

        return df

    def _inverse_transform(self, df):  # pragma: no cover
        return df

    def fit(self, df, y=None):
        """Fit the transformer.

        Args:
            df (:obj:`pandas.DataFrame`): DataFrame used to fit the
                transformation.
        """

        cols = fit_columns(df, self.cols_init, self.dtype,
                           self.cols_not_found_error)
        exclude = (fit_columns(df, self.exclude, None,
                               self.cols_not_found_error)
                   if self.exclude else [])
        self.cols = [i for i in cols if i not in exclude]
        self._fit(df)
        # Save column maps and lists.
        self.col_map = self.get_column_mapping()
        # Recreate cols, just in case you overwrite get_column_mapping
        # but you didn't specify any cols_init.
        self.cols = list(collections.OrderedDict.fromkeys(
            flatten_list(self.col_map.keys())))
        col_map_1_n, col_map_1_n_inverse = {}, {}
        for k, v in self.col_map.items():
            add_to_map_dict(col_map_1_n, k, v)
            add_to_map_dict(col_map_1_n_inverse, v, k)
        self.col_map_1_n = col_map_1_n
        self.col_map_1_n_inverse = col_map_1_n_inverse
        self.cols_out = list(collections.OrderedDict.fromkeys(
            flatten_list(self.col_map.values())))
        # Columns present in both the input and output columns should be
        # removed from df_in during the transform phase.
        # We join df_in with df_out, so we do not want duplicate column names.
        self.cols_in_out = set(self.cols).intersection(set(self.cols_out))
        if self.keep_original and self.cols_in_out:
            raise ValueError("Rename the output columns if you want to keep "
                             "the original columns, name collisions in "
                             f"{self.cols_in_out}")
        return self

    def transform(self, df_in):
        """Transform a DataFrame.

        Args:
            df_in (:obj:`pandas.DataFrame`): Input DataFrame.

        Returns:
            Transformed DataFrame.

        Raise:
            :obj:`ValueError` if `cols` are not fitted.
            Fit the transformer to avoid this error.
        """

        if self.cols is None:
            raise ValueError("No cols set. Transformer not fitted?")
        in_cols = df_in.columns
        df_out = self._transform(df_in)
        df_in = df_in.drop(self.cols_in_out, axis=1)
        # Join input columns to output columns.
        df_joined = df_in.join(df_out)
        # Reorder output columns and return.
        # This cannot be precomputed during fit because we want to support
        # any extra column not present at fit time. We also want to
        # maintain the input column order.
        # Ex: we can fit a df that contains a target column and transform
        # dfs without that target.
        cols_out = self._get_ordered_out_cols(in_cols, self.cols,
                                              self.col_map_1_n,
                                              self.keep_original)
        return df_joined[cols_out]

    def _get_ordered_out_cols(self, cols_in_all, cols_in, col_map_1_n,
                              keep_original=False):
        cols_out = []
        for i in cols_in_all:
            if i in cols_in:
                if keep_original:
                    cols_out.append(i)
                cols_out += col_map_1_n[i]
            else:
                cols_out.append(i)
        # Remove duplicates.
        cols_out = list(collections.OrderedDict.fromkeys(cols_out))
        return cols_out

    def inverse_transform(self, df_in):
        in_cols = df_in.columns
        if self.keep_original and all(i in in_cols for i in self.cols):
            # If we keep the original columns and all of them are present in
            # df_in, we save computation by removing the output columns and
            # returning the original columns.
            df_in = df_in.drop(self.cols_out, axis=1, errors="ignore")
            return df_in
        df_out = self._inverse_transform(df_in)
        df_in = df_in.drop(self.cols_in_out, axis=1)
        if self.keep_original:
            # If we keep the original cols, the best option would be to just
            # drop the transformed columns (look at the first if of this
            # method), but I want it to work even without those columns.
            df_in = df_in.drop(df_out.columns, axis=1, errors="ignore")
        df_joined = df_in.join(df_out)
        cols_out = self._get_ordered_out_cols(in_cols, self.cols_out,
                                              self.col_map_1_n_inverse)
        return df_joined[cols_out]

    def get_column_mapping(self):
        """Get the column mapping between the input and transformed DataFrame.

        By default it returns a 1:1 map between the input and output columns.
        Make sure your transformer is fitted before calling this function.

        Return:
            A dict in which the keys are the input DataFrame column names and
            the values are the output DataFrame column names.
            Both keys and values can be tuples: a tuple key is useful to
            indicate that one output column has been created from a list of
            columns of the input DataFrame, and a tuple value is useful to
            indicate that a list of output columns comes from one specific
            column of the input DataFrame.
            We use tuples and not lists because lists are not hashable, so
            they cannot be keys in a dict.

        See Also:
            :obj:`recipipe.core.RecipipeTransformer.col_format`

        Raise:
            :obj:`ValueError` if `self.cols` is `None`.
        """

        if self.cols is None:
            raise ValueError("No columns. Transformer not fitted?")
        return {i: self.col_format.format(i) for i in self.cols}
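

# Illustrative sketch (not part of the original module): a minimal transformer
# built on top of RecipipeTransformer. It only overrides `_fit` and
# `_transform`; column selection (by name, fnmatch pattern or dtype), renaming
# through `col_format` and the input/output column bookkeeping are all
# inherited from the base class. The class name is made up for the example.
class _ExampleMinMaxScaler(RecipipeTransformer):  # pragma: no cover

    def _fit(self, df):
        # self.cols is already fitted at this point; store per-column
        # min and max for the fitted columns only.
        self.min_ = df[self.cols].min()
        self.max_ = df[self.cols].max()

    def _transform(self, df):
        # Return a DataFrame consistent with the (default 1:1) column
        # mapping: one output column per fitted input column, renamed
        # according to col_format.
        out = (df[self.cols] - self.min_) / (self.max_ - self.min_)
        out.columns = self.cols_out
        return out

# Example usage of the sketch above, e.g. scaling all numeric columns while
# keeping the originals (assumes `df` is a pandas DataFrame):
#
#     scaler = _ExampleMinMaxScaler(dtype="number", col_format="{}_scaled",
#                                   keep_original=True)
#     df_out = scaler.fit(df).transform(df)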