diff --git a/.gitignore b/.gitignore index ba74660..de69825 100644 --- a/.gitignore +++ b/.gitignore @@ -55,3 +55,6 @@ docs/_build/ # PyBuilder target/ + +# Editors +.vscode diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 0000000..3c04e40 --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,13 @@ +version: 2 + +build: + os: ubuntu-22.04 + tools: + python: "3.12" + +sphinx: + configuration: ./conf.py + +python: + install: + - requirements: requirements.txt \ No newline at end of file diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..9e1ed7d --- /dev/null +++ b/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +SPHINXPROJ = Scikit-learnenhancementproposals +SOURCEDIR = . +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/README.rst b/README.rst index 3bf8cd8..6028cd2 100644 --- a/README.rst +++ b/README.rst @@ -1,6 +1,6 @@ -===================================== +================================== Scikit-learn enhancement proposals -===================================== +================================== This repository is for structured discussions about large modifications or additions to scikit-learn. @@ -12,4 +12,5 @@ possible solution. It should be a summary of the key points that drive the decision, and ideally converge to a draft of an API or object to be implemented in scikit-learn. - +The SLEPs are publicly available online on `Read The Docs +`_. \ No newline at end of file diff --git a/_config.yml b/_config.yml new file mode 100644 index 0000000..c741881 --- /dev/null +++ b/_config.yml @@ -0,0 +1 @@ +theme: jekyll-theme-slate \ No newline at end of file diff --git a/conf.py b/conf.py new file mode 100644 index 0000000..d75b601 --- /dev/null +++ b/conf.py @@ -0,0 +1,172 @@ +# -*- coding: utf-8 -*- +# +# Configuration file for the Sphinx documentation builder. +# +# This file does only contain a selection of the most common options. For a +# full list see the documentation: +# http://www.sphinx-doc.org/en/master/config + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + + +# -- Project information ----------------------------------------------------- + +project = 'Scikit-learn enhancement proposals' +copyright = '2018, scikit-learn community' +author = 'scikit-learn community' + +# The short X.Y version +version = '' +# The full version, including alpha/beta/rc tags +release = '' + + +# -- General configuration --------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. 
They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.intersphinx', + 'sphinx.ext.mathjax', + 'sphinx.ext.viewcode', + 'sphinx_issues', +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The master toctree document. +master_doc = 'index' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path . +exclude_patterns = [] + +default_role = 'literal' + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'sphinx_rtd_theme' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +# html_static_path = ['_static'] + +# Custom sidebar templates, must be a dictionary that maps document names +# to template names. +# +# The default sidebars (for documents that don't match any pattern) are +# defined by theme itself. Builtin themes are using these templates by +# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', +# 'searchbox.html']``. +# +# html_sidebars = {} + + +# -- Options for HTMLHelp output --------------------------------------------- + +# Output file base name for HTML help builder. +htmlhelp_basename = 'Scikit-learnenhancementproposalsdoc' + + +# -- Options for LaTeX output ------------------------------------------------ + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'Scikit-learnenhancementproposals.tex', 'Scikit-learn enhancement proposals Documentation', + 'scikit-learn community', 'manual'), +] + + +# -- Options for manual page output ------------------------------------------ + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). 
+man_pages = [ + (master_doc, 'scikit-learnenhancementproposals', 'Scikit-learn enhancement proposals Documentation', + [author], 1) +] + + +# -- Options for Texinfo output ---------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'Scikit-learnenhancementproposals', 'Scikit-learn enhancement proposals Documentation', + author, 'Scikit-learnenhancementproposals', 'One line description of project.', + 'Miscellaneous'), +] + + +# -- Extension configuration ------------------------------------------------- + +# -- Options for intersphinx extension --------------------------------------- + +intersphinx_mapping = {'sklearn': ('http://scikit-learn.org/stable', None)} + +# -- Sphinx-Issues configuration -- + +issues_github_path = "scikit-learn/scikit-learn" diff --git a/index.rst b/index.rst new file mode 100644 index 0000000..9848922 --- /dev/null +++ b/index.rst @@ -0,0 +1,51 @@ +.. Scikit-learn enhancement proposals documentation master file, created by + sphinx-quickstart on Wed Dec 12 10:57:18 2018. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +.. include:: README.rst + +.. toctree:: + :maxdepth: 1 + :caption: Accepted + + slep000/proposal + slep006/proposal + slep007/proposal + slep009/proposal + slep010/proposal + slep017/proposal + slep018/proposal + slep020/proposal + +.. toctree:: + :maxdepth: 1 + :caption: Under review + + slep017/proposal + slep019/proposal + +.. toctree:: + :maxdepth: 1 + :numbered: + :caption: Delayed review + + slep001/proposal + slep002/proposal + slep003/proposal + +.. toctree:: + :maxdepth: 1 + :caption: Rejected + + slep004/proposal + slep012/proposal + slep013/proposal + slep014/proposal + slep015/proposal + +.. toctree:: + :maxdepth: 1 + :caption: Template + + slep_template diff --git a/make.bat b/make.bat new file mode 100644 index 0000000..55e4c0e --- /dev/null +++ b/make.bat @@ -0,0 +1,36 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=build +set SPHINXPROJ=Scikit-learnenhancementproposals + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% + +:end +popd diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..5666abb --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +sphinx +sphinx-rtd-theme +sphinx-issues diff --git a/slep000/pep-0001-process_flow.png b/slep000/pep-0001-process_flow.png new file mode 100644 index 0000000..0fc8176 Binary files /dev/null and b/slep000/pep-0001-process_flow.png differ diff --git a/slep000/proposal.rst b/slep000/proposal.rst new file mode 100644 index 0000000..7c7aeda --- /dev/null +++ b/slep000/proposal.rst @@ -0,0 +1,298 @@ +.. 
_slep_000: + +============================== +SLEP000: SLEP and its workflow +============================== + +:Author: Adrin Jalali +:Status: Accepted +:Type: Process +:Created: 2020-02-13 + +Abstract +######## + +This SLEP specifies details related to the SLEP submission, review, and +acceptance process. + +Motivation +########## + +Without a predefined workflow, the discussions around a SLEP can be long and +consume a lot of energy for both the author(s) and the reviewers. The lack of a +known workflow also results in SLEPs taking months (if not years) before +they are merged as ``Under Review``. The purpose of this SLEP is to streamline and +ease the process of working on a SLEP, and make it a more enjoyable and +productive experience. This SLEP borrows the process used for PEPs and NEPs, +which means there will be no ``Under Review`` status. + + +What is a SLEP? +############### + +SLEP stands for Scikit-Learn Enhancement Proposal, inspired by Python's PEPs and +NumPy's NEPs. A SLEP is a design document providing information to the +scikit-learn community, or describing a new feature for scikit-learn or its +processes or environment. The SLEP should provide a concise technical +specification of the proposed solution, and a rationale for the feature. + +We intend SLEPs to be the primary mechanisms for proposing major new features, +for collecting community input on an issue, and for documenting the design +decisions that have gone into scikit-learn. The SLEP author is responsible for +building consensus within the community and documenting dissenting opinions. + +Because the SLEPs are maintained as text files in a versioned repository, their +revision history is the historical record of the feature proposal. + +SLEP Audience +############# + +The typical primary audience for SLEPs is the core developers of +``scikit-learn`` and the technical committee, as well as contributors to the +project. However, these documents also serve the purpose of documenting the +changes and decisions to help users understand the changes and why they were +made. The SLEPs are available under `Scikit-learn enhancement proposals +`_. + +SLEP Types +########## + +There are three kinds of SLEPs: + +1. A Standards Track SLEP describes a new feature or implementation for +scikit-learn. + +2. An Informational SLEP describes a scikit-learn design issue, or provides +general guidelines or information to the scikit-learn community, but does not +propose a new feature. Informational SLEPs do not necessarily represent a +scikit-learn community consensus or recommendation, so users and implementers +are free to ignore Informational SLEPs or follow their advice. For instance, an +informational SLEP could be one explaining how people can write a third-party +estimator, one explaining the usual process of adding a package to the contrib +org, or what our inclusion criteria are for scikit-learn and +scikit-learn-extra. + +3. A Process SLEP describes a process surrounding scikit-learn, or proposes a +change to (or an event in) a process. Process SLEPs are like Standards Track +SLEPs but apply to areas other than the scikit-learn library itself. They may +propose an implementation, but not to scikit-learn’s codebase; they require +community consensus. Examples include procedures, guidelines, changes to the +decision-making process and the governance document, and changes to the tools +or environment used in scikit-learn development. Any meta-SLEP is also +considered a Process SLEP. 
+ + +SLEP Workflow +############# + +A SLEP starts with an idea, which is usually discussed in an issue or a pull +request on the main repo before submitting a SLEP. It is generally a good idea +for the author of the SLEP to gauge the viability and the interest of the +community before working on a SLEP, mostly to save the author's time. + +A SLEP must have one or more champions: people who write the SLEP following the +SLEP template, shepherd the discussions around it, and seek consensus in the +community. + +The proposal should be submitted as a draft SLEP via a GitHub pull request to a +``slepXXX`` directory with the name ``proposal.rst`` where ``XXX`` is an +appropriately assigned three-digit number (e.g., ``slep000/proposal.rst``). The +draft must use the `SLEP — Template and Instructions +`_ +file. + +Once the PR for the SLEP is created, a post should be made to the mailing list +containing the sections up to “Backward compatibility”, with the purpose of +limiting discussion there to usage and impact. Discussion on the pull request +will have a broader scope, also including details of implementation. + +The first draft of the SLEP needs to be approved by at least one core developer +before being merged. Merging the draft does not mean it is accepted or is ready +for the vote. To this end, the SLEP draft is reviewed for structure, +formatting, and other errors. Approval criteria are: + +- The draft is sound and complete. The ideas must make technical sense. +- The initial PR reviewer(s) should not consider whether the SLEP seems likely + to be accepted. +- The title of the SLEP draft accurately describes its content. + +Reviewers are generally quite lenient about this initial review, expecting that +problems will be corrected by the further reviewing process. **Note**: Approval +of the SLEP draft is no guarantee that there are no embarrassing mistakes! +Ideally they're avoided, but they can also be fixed later in separate PRs. Once +approved by at least one core developer, the SLEP draft can be merged. +Additional PRs may be made by the champions to update or expand the SLEP, or by +maintainers to set its status, discussion URL, etc. + +Standards Track SLEPs (see above) consist of two parts, a design document and +a reference implementation. It is generally recommended that at least a +prototype implementation be co-developed with the SLEP, as ideas that sound +good in principle sometimes turn out to be impractical when subjected to the +test of implementation. Often it makes sense for the prototype implementation +to be made available as a PR to the scikit-learn repo (making sure to +appropriately mark the PR as a WIP). + +Review and Resolution +--------------------- + +SLEPs are discussed on the mailing list or in the PRs modifying the SLEP. The +possible paths of the status of SLEPs are as follows: + +.. image:: pep-0001-process_flow.png + :alt: SLEP process flow diagram + +All SLEPs should be created with the ``Draft`` status. + +Eventually, after discussion, there may be a consensus that the SLEP should be +accepted – see the next section for details. At this point the status becomes +``Accepted``. + +Once a SLEP has been ``Accepted``, the reference implementation must be +completed. When the reference implementation is complete and incorporated into +the main source code repository, the status will be changed to ``Final``. Since +most SLEPs deal with a part of scikit-learn's API, another way of viewing a +SLEP as ``Final`` is that its corresponding API interface is considered stable. 
+ +To allow gathering of additional design and interface feedback before +committing to long-term stability for a feature or API, a SLEP may also be +marked as ``Provisional``. This is short for "Provisionally Accepted", and +indicates that the proposal has been accepted for inclusion in the reference +implementation, but additional user feedback is needed before the full design +can be considered ``Final``. Unlike regular accepted SLEPs, provisionally +accepted SLEPs may still be ``Rejected`` or ``Withdrawn`` even after the +related changes have been included in a scikit-learn release. + +Wherever possible, it is considered preferable to reduce the scope of a +proposal to avoid the need to rely on the ``Provisional`` status (e.g. by +deferring some features to later SLEPs), as this status can lead to version +compatibility challenges in the wider scikit-learn ecosystem. + +A SLEP can also be assigned the status ``Deferred``. The SLEP author or a core +developer can assign the SLEP this status when no progress is being made on the +SLEP. + +A SLEP can also be ``Rejected``. Perhaps after all is said and done it was not +a good idea. It is still important to have a record of this fact. The +``Withdrawn`` status is similar; it means that the SLEP author themselves has +decided that the SLEP is actually a bad idea, or has accepted that a competing +proposal is a better alternative. + +When a SLEP is ``Accepted``, ``Rejected``, or ``Withdrawn``, the SLEP should be +updated accordingly. In addition to updating the status field, at the very +least the ``Resolution`` header should be added with a link to the relevant +thread in the mailing list archives or where the discussion happened. + +SLEPs can also be ``Superseded`` by a different SLEP, rendering the original +obsolete. The ``Replaced-By`` and ``Replaces`` headers should be added to the +original and new SLEPs respectively. + +``Process`` SLEPs may also have a status of ``Active`` if they are never meant +to be completed, e.g. this SLEP (SLEP000). + +How a SLEP becomes Accepted +--------------------------- + +A SLEP is ``Accepted`` by the voting mechanism defined in the `governance model +`_. We +need a concrete way to tell whether consensus has been reached. When you think +a SLEP is ready to accept, create a PR changing the status of the SLEP to +``Accepted``, then send an email to the scikit-learn mailing list with a +subject like: + + [VOTE] Proposal to accept SLEP #<number>: <title> + +In the body of your email, you should: + +- link to the latest version of the SLEP, and a link to the PR accepting the + SLEP, + +- briefly describe any major points of contention and how they were resolved, + +- include a sentence like: “The vote will be closed in a month, i.e. on + <the_date>.” + +Generally the SLEP author will be the one to send this email, but anyone can do +it; the important thing is to make sure that everyone knows when a SLEP is on +the verge of acceptance, and give them a final chance to respond. + +In general, the goal is to make sure that the community has consensus, not to +provide a rigid policy for people to try to game. When in doubt, err on the +side of asking for more feedback and looking for opportunities to compromise. + +If the final comment and voting period passes with the required majority, then +the SLEP can officially be marked ``Accepted``. The ``Resolution`` header +should link to the PR accepting the SLEP. 
+ +If the vote does not achieve a required majority, then the SLEP remains in +``Draft`` state, discussion continues as normal, and it can be proposed for +acceptance again later once the objections are resolved. + +In unusual cases, at the request of the author, the scikit-learn technical +committee may be asked to decide whether a controversial SLEP is ``Accepted``, +put back to ``Draft`` with an additional recommendation to try again to reach +consensus, or definitively ``Rejected``. Please refer to the governance doc for +more details. + +Maintenance +----------- + +In general, Standards Track SLEPs are no longer modified after they have +reached the ``Final`` state as the code and project documentation are +considered the ultimate reference for the implemented feature. However, +finalized Standards Track SLEPs may be updated as needed. + +Process SLEPs may be updated over time to reflect changes to development +practices and other details. The precise process followed in these cases will +depend on the nature and purpose of the SLEP being updated. + +Format and Template +------------------- + +SLEPs are UTF-8 encoded text files using the `reStructuredText +<http://docutils.sourceforge.net/rst.html>`_ format. Please see the `SLEP — +Template and Instructions +<https://github.com/scikit-learn/enhancement_proposals/blob/master/slep_template.rst>`_ +file and the `reStructuredText Primer +<https://www.sphinx-doc.org/en/stable/rest.html>`_ for more information. We use +`Sphinx <https://www.sphinx-doc.org/en/stable/>`_ to convert SLEPs to HTML for +viewing on the web. + +Header Preamble +--------------- + +Each SLEP must begin with a header preamble. The headers must appear in the +following order. Headers marked with * are optional. All other headers are +required:: + + :Author: <list of authors' real names and optionally, email addresses> + :Status: <Draft | Active | Accepted | Deferred | Rejected | + Withdrawn | Final | Superseded> + :Type: <Standards Track | Informational | Process> + :Created: <date created on, in yyyy-mm-dd format> + * :Requires: <slep numbers> + * :scikit-learn-Version: <version number> + * :Replaces: <slep number> + * :Replaced-By: <slep number> + * :Resolution: <url> + +The Author header lists the names, and optionally the email addresses, of all +the authors of the SLEP. The format of the Author header value must be + + Random J. User <address@dom.ain> + +if the email address is included, and just + + Random J. User + +if the address is not given. If there are multiple authors, each should be on a +separate line. + +Copyright +--------- + +This document has been placed in the public domain [1]_. + +References and Footnotes +------------------------ + +.. [1] Open Publication License: https://www.opencontent.org/openpub/ diff --git a/slep001/example_outlier_digits.py b/slep001/example_outlier_digits.py new file mode 100644 index 0000000..8b308ba --- /dev/null +++ b/slep001/example_outlier_digits.py @@ -0,0 +1,69 @@ +""" +Small example doing data filtering on digits for t-SNE embedding. +""" +from time import time + +import numpy as np +import matplotlib.pyplot as plt + +from sklearn import manifold, datasets, decomposition + +from outlier_filtering import EllipticEnvelopeFilter +from subsampler import SubSampler + +digits = datasets.load_digits() +X = digits.data +y = digits.target +n_samples, n_features = X.shape + + +#---------------------------------------------------------------------- +# Scale and visualize the embedding vectors +def plot_embedding(X, y, title=None): + x_min, x_max = np.min(X, 0), np.max(X, 0) + X = (X - x_min) / (x_max - x_min) + + plt.figure() + plt.subplot(111) + for this_x, this_y in zip(X, y): + plt.text(this_x[0], this_x[1], str(this_y), + color=plt.cm.Set1(this_y / 10.), + fontdict={'weight': 'bold', 'size': 9}) + + plt.xticks([]), plt.yticks([]) + if title is not None: + plt.title(title) + + +print("Computing t-SNE embedding") + +tsne = manifold.TSNE(n_components=2, init='pca', random_state=0) + +subsampler = SubSampler(random_state=1, ratio=.5) + +filtering = EllipticEnvelopeFilter(random_state=1) + +t0 = time() + +# We need a PCA reduction of X because MinCovDet crashes otherwise +X_pca = decomposition.RandomizedPCA(n_components=30).fit_transform(X) +filtering.fit_pipe(*subsampler.transform_pipe(X_pca)) + +print("Fitting filtering done: %.2fs" % (time() - t0)) + +X_red, y_red = filtering.transform_pipe(X_pca, y) + +X_tsne = tsne.fit_transform(X_red) + +plot_embedding(X_tsne, y_red, + "With outlier_filtering") + + +# Now without outlier_filtering +X_tsne = tsne.fit_transform(X_pca) + +plot_embedding(X_tsne, y, + "Without outlier_filtering") + +plt.show() + diff --git a/slep001/outlier_filtering.py b/slep001/outlier_filtering.py new file mode 100644 index 0000000..9e20db4 --- /dev/null +++ b/slep001/outlier_filtering.py @@ -0,0 +1,32 @@ +from sklearn.base import BaseEstimator +from sklearn.covariance import EllipticEnvelope + + +class EllipticEnvelopeFilter(BaseEstimator): + + def __init__(self, assume_centered=False, + support_fraction=None, contamination=0.1, + random_state=None): + self.assume_centered = assume_centered + self.support_fraction = support_fraction + self.contamination = contamination + self.random_state = random_state + + def fit_pipe(self, X, y=None): + self.elliptic_envelope_ = EllipticEnvelope(**self.get_params()) + self.elliptic_envelope_.fit(X) + return self.transform_pipe(X, y) + + def transform_pipe(self, X, y): + # XXX: sample_props not taken care of + is_inlier = self.elliptic_envelope_.predict(X) == 1 + X_out = X[is_inlier] + if y is None: + y_out = None + else: + y_out = y[is_inlier] + return X_out, y_out + + def transform(self, X, y=None): + return X + diff --git a/slep001/proposal.rst b/slep001/proposal.rst new file mode 100644 index 0000000..a335b5f --- /dev/null +++ b/slep001/proposal.rst @@ -0,0 +1,383 @@ +.. _slep_001: + +============================================== +SLEP001: Transformers that modify their target +============================================== + +.. topic:: **Summary** + + Transformers implement:: + + self = estimator.fit(X, y=None) + X_transform = estimator.transform(X) + estimator.fit(X, y=None).transform(X) == estimator.fit_transform(X, y) + + Within a chain or processing sequence of estimators, many use cases + require modifying y. How do we support this? + + Doing many of these things is possible "by hand". The question is: + how to avoid writing custom connecting logic. + +.. 
contents:: Table of contents + :depth: 2 + +Rationale +========= + +Summary of the contract of transformers +---------------------------------------- + +* .transform(...) returns a data matrix X + +* .transform(...) returns one feature vector for each sample of the input + +* .fit_transform(...) is the same as .fit(...).transform(...) + +Examples of use cases targeted +------------------------------- + +#. Over-sampling: + + #. Class rebalancing: over-sampling the minority class in + an unbalanced dataset + #. Data enhancement (nudging images for instance) + +#. Under-sampling + + #. Stateless undersampling: take one sample out of two + #. Stateful undersampling: apply clustering and transform to cluster + centers + #. Coresets: return a smaller number of samples and associated sample + weights + +#. Outlier detection: + + #. Remove outliers from the train set + #. Create a special class 'y' for outliers + +#. Completing y: + + #. Missing data imputation on y + #. Semi-supervised learning (related to the above) + +#. Data loading / conversion + + #. Pandas in => (X, y) out + #. Images in => patches out + #. Filename in => (X, y) with multiple samples (very useful in + combination with online learning) + #. Database query => (X, y) out + +#. Aggregate statistics over multiple samples + + #. Windowing-like functions on time-series + + In a sense, these are dodgy with scikit-learn's cross-validation API + that knows nothing about sample structure. But the refactor of the CV + API is really helping in this regard. + +____ + +These use cases pretty much require breaking the contract of the +Transformer, as detailed above. + +The intuition driving this enhancement proposal is that the richer the +data-processing pipeline becomes and the more the data grow, the more +important the use cases above become. 
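+
+To make the contract breakage concrete, here is a minimal sketch (the
+``HalfSampler`` class is purely illustrative, not an existing scikit-learn
+estimator) of why a plain transformer cannot subsample: ``transform`` can
+only return ``X``, so ``y`` silently falls out of alignment::
+
+    import numpy as np
+    from sklearn.base import BaseEstimator, TransformerMixin
+
+    class HalfSampler(BaseEstimator, TransformerMixin):
+        """Keep one sample out of two: breaks the transformer contract."""
+
+        def fit(self, X, y=None):
+            return self
+
+        def transform(self, X):
+            # Only X can be returned: there is no channel for returning
+            # the matching half of y (or sample weights).
+            return X[::2]
+
+    X, y = np.arange(10).reshape(5, 2), np.arange(5)
+    X_half = HalfSampler().fit_transform(X, y)
+    # X_half now has 3 rows while y still has 5: any downstream step
+    # receiving (X_half, y) is silently misaligned.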
+ +Enhancements proposed +===================== + +Option A: meta-estimators +------------------------- + +Proposal +........ + +This option advocates that any transformer-like use case that wants to +modify y or the number of samples should not be a transformer-like object but a +specific meta-estimator. A core-set object would thus look like: + +* From the user perspective:: + + from sklearn.sample_shrink import BirchCoreSet + from sklearn.ensemble import RandomForest + estimator = BirchCoreSet(RandomForest()) + +* From the developer perspective:: + + class BirchCoreSet(BaseEstimator): + + def fit(self, X, y): + # The logic here is wrong, as we need to handle y: + super(BirchCoreSet, self).fit(X) + X_red = self.subcluster_centers_ + self.estimator_.fit(X_red) + +Benefits +......... + +#. No change to the existing API + +#. The meta-estimator pattern is very powerful, and pretty much anything + is possible. + +Limitations +............ + +The different limitations listed below are variants of the same +conceptual difficulty. + +#. It is hard to have mental models and guarantees of what a + meta-estimator does, as it is by definition super versatile. + + This is both a problem for the beginner, who needs to learn them on + an almost case-by-case basis, and for the advanced user, who needs to + maintain a set of case-specific code. + +#. The "estimator heap" problem. + + Here the word heap is used to denote the multiple pipelines and + meta-estimators. It corresponds to what we would naturally call a + "data processing pipeline", but we use "heap" to avoid confusion with + the pipeline object. + + Heaps combining many steps of pipelines and meta-estimators become + very hard to inspect and manipulate, both for the user, and for + pipeline-management (aka "heap-management") code. Currently, these + difficulties are mostly in user code, so we don't see them too much in + scikit-learn. Here are concrete examples: + + #. Trying to retrieve coefficients from a model estimated in a + "heap". E.g.: + + * you know there is a lasso in your stack and you want to + get its coef (in whatever space that resides?): + ``pipeline.named_steps['lasso'].coef_`` is possible. + + * you want to retrieve the coef of the last step: + ``pipeline.steps[-1][1].coef_`` is possible. + + With meta-estimators this is tricky. + Solving this problem requires + https://github.com/scikit-learn/scikit-learn/issues/2562#issuecomment-27543186 + (this enhancement proposal is not advocating solving the problem + above, but pointing it out as an illustration) + + #. DaskLearn has modified the logic of pipeline to expose it as a + computation graph. The reason that it was relatively easy to do is + that there was mostly one object to modify to do the dispatching, + the Pipeline object. + + #. A future, out-of-core "conductor" object to fit a "heap" out of + core by connecting it to a data-store would need to have a + representation of the heap. For instance, when chaining random + projections with Birch coresets and finally SGD, the user would + need to specify that random projections are stateless, Birch needs + to do one pass of the data, and SGD a few. Given this information, + the conductor could orchestrate pulling the data from the data source + and sending it to the various steps. Such an object is much harder + to implement if the various steps are to be combined in a heap. + Note that the scikit-learn pipeline can only implement a linear, + "chain"-like set of processing steps. For instance, a One-vs-All scheme + will never be able to be implemented in a scikit-learn pipeline. + + This is not a problem in non out-of-core settings, in the sense + that the BirchCoreSet meta-estimator would take care of doing a + pass on the data before feeding it to its sub-estimator. + +In conclusion, meta-estimators are harder to comprehend (problem 1) and +write (problem 2). + +That said, we will never get rid of meta-estimators. It is a very +powerful pattern. The discussion here is about extending the +estimator API a bit to have a less pressing need for meta-estimators. 
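+
+As an illustration of the pattern, here is a sketch of how the
+``BirchCoreSet`` meta-estimator above could actually handle y
+(``sklearn.sample_shrink`` and ``BirchCoreSet`` are hypothetical names from
+this proposal; labeling each center by its nearest training sample is only
+one possible choice, picked for illustration)::
+
+    from sklearn.base import BaseEstimator, clone
+    from sklearn.cluster import Birch
+    from sklearn.metrics import pairwise_distances_argmin
+
+    class BirchCoreSet(BaseEstimator):
+        """Fit a sub-estimator on the Birch subcluster centers."""
+
+        def __init__(self, estimator, threshold=0.5):
+            self.estimator = estimator
+            self.threshold = threshold
+
+        def fit(self, X, y):
+            self.birch_ = Birch(threshold=self.threshold, n_clusters=None)
+            self.birch_.fit(X)
+            X_red = self.birch_.subcluster_centers_
+            # Reduce y consistently with X: label each center with the
+            # label of the closest training sample.
+            y_red = y[pairwise_distances_argmin(X_red, X)]
+            self.estimator_ = clone(self.estimator).fit(X_red, y_red)
+            return self
+
+        def predict(self, X):
+            return self.estimator_.predict(X)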
+ +Option B: transformer-likes that modify y +------------------------------------------ + +.. topic:: **Two variants** + + 1. Changing the semantics of transformers to modify y and return + something more complex than a data matrix X + + 2. Introducing new methods (and a new type of object) + + There is an emerging consensus for option 2. + +.. topic:: **``transform`` modifying y** + + Variant 1 above could be implemented by allowing transform to modify + y. However, the return signature of transform would be unclear. + + Do we modify all transformers to return a y (y=None for unsupervised + transformers that are not given y)? This sounds like a recipe for code + full of surprises that is difficult to maintain from the user's perspective. + + We would lose the contract that the number of samples is unchanged by + a transformer. This contract is very useful (e.g. for model selection: + measuring the error for each sample). + + For these reasons, we feel new methods are necessary. + +Proposal +......... + +Introduce a ``TransModifier`` type of object with the following API +(names are discussed below): + +* ``X_new, y_new = estimator.fit_modify(X, y)`` + +* ``X_new, y_new = estimator.trans_modify(X, y)`` + +Or: + +* ``X_new, y_new, sample_props = estimator.fit_modify(X, y)`` + +* ``X_new, y_new, sample_props = estimator.trans_modify(X, y)`` + +Contracts (these are weaker contracts than the transformer's): + +* Neither ``fit_modify`` nor ``trans_modify`` is guaranteed to keep the + number of samples unchanged. + +* ``fit_modify`` may not exist (questionable) + +Design questions and difficulties +................................. + +Should there be a fit method? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In such estimators, it may not be a good idea to call fit rather than +fit_modify (for instance for coresets). + + +How does a pipeline use such an object? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In particular at test time? + +#. Should there be a transform method used at test time? + +#. What to do with objects that implement both ``transform`` and + ``trans_modify``? + +**Creating y in a pipeline makes error measurement harder.** For some +use cases, test time needs to modify the number of samples (for instance +data loading from a file). However, these will by construction be a problem +for e.g. ``cross_val_score``, as in supervised settings these expect a y_true. +Indeed, the problem is the following: + +- To measure an error, we need y_true at the level of + `sklearn.model_selection.cross_val_score` or + `sklearn.model_selection.GridSearchCV` + +- y_true is created inside the pipeline by the data-loading object. + +It is thus unclear whether the data-loading use cases can be fully +integrated in the CV framework (which is not an argument against +enabling them). + +| + +For our CV framework, we need the number of samples to remain +constant: for each y_pred, we need a corresponding y_true. + +| + +**Proposal 1**: use transform at ``predict`` time. + +#. Objects implementing both ``transform`` and ``trans_modify`` are valid + +#. The pipeline's ``predict`` method uses ``transform`` on its intermediate + steps + +The different semantics of ``trans_modify`` and ``transform`` can be very useful, +as ``transform`` keeps the notion of sample, and ``y_true``, untouched. + +| + +**Proposal 2**: modify the scoring framework + +One option is to modify the scoring framework to be able to handle +these things: the scoring gets the output of the chain of +trans_modify for y. This should rely on clever code in the ``score`` method +of pipeline. Maybe it should be controlled by a keyword argument on the +pipeline, and turned off by default. + + +How do we deal with sample weights and other sample properties? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This discussion feeds into the ``sample_props`` discussion (which should +be discussed in a different enhancement proposal). + +The suggestion is to have the sample properties as a dictionary of +arrays ``sample_props``. + +**Example use case** useful for thinking about sample properties: coresets, +which given (X, y) return (X_new, y_new, weights) with a much smaller number +of samples. + +This example is interesting because it shows that TransModifiers can +legitimately create sample properties. + +**Proposed solution**: + +TransModifiers always return (X_new, y_new, sample_props) where +sample_props can be an empty dictionary. 
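+
+A minimal sketch of what such a TransModifier could look like, using the
+proposed ``fit_modify``/``trans_modify`` names and the three-tuple return
+signature (the class itself is only illustrative)::
+
+    from sklearn.base import BaseEstimator
+    from sklearn.utils import check_random_state
+
+    class RandomSubSampler(BaseEstimator):
+        """Keep a random fraction of the samples."""
+
+        def __init__(self, ratio=0.5, random_state=None):
+            self.ratio = ratio
+            self.random_state = random_state
+
+        def fit_modify(self, X, y=None):
+            self.random_state_ = check_random_state(self.random_state)
+            return self.trans_modify(X, y)
+
+        def trans_modify(self, X, y=None):
+            # Subsample X and y consistently; the (here empty)
+            # sample_props dict travels along with them.
+            mask = self.random_state_.random_sample(X.shape[0]) < self.ratio
+            y_new = y[mask] if y is not None else None
+            return X[mask], y_new, {}
+
+This mirrors the ``subsampler.py`` prototype shipped with this SLEP, which
+uses the ``transform_pipe`` naming variant instead.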
+ +Naming suggestions +.................. + +In terms of name choice, the rationale would be to have method names that +are close to 'fit' and 'transform', to make discoverability and +readability of the code easier. + +* Name of the object (as referred to in the docs): + - TransModifier + - TransformPipe + - PipeTransformer + +* Method to fit and apply on training data + - fit_modify + - fit_pipe + - pipe_fit + - fit_filter + +* Method to apply on new data + - trans_modify + - transform_pipe + - pipe_transform + +Benefits +........ + +* Many use cases listed above can be implemented in scikit-learn without a + meta-estimator, and thus will be easy to use (e.g. in a pipeline). Many + of these are patterns that we should be encouraging. + +* The API being more versatile, it will be easier to create + application-specific code or framework wrappers (à la DaskLearn) that + are scikit-learn compatible, and thus can be used with the + parameter-selection framework. This will be especially true for the ETL + (extract, transform, and load) pattern. + +Limitations +........... + +* Introducing new methods, and a new type of estimator object. There are + probably a total of **3 new methods** that will get introduced by this + enhancement: fit_modify, trans_modify, and partial_fit_modify. + +* Cannot solve all possible cases, and thus we will not get rid of + meta-estimators. + +TODO +==== + +* Implement an example doing outlier filtering + +* Implement an example doing data downsampling diff --git a/slep001/subsampler.py b/slep001/subsampler.py new file mode 100644 index 0000000..02620e4 --- /dev/null +++ b/slep001/subsampler.py @@ -0,0 +1,24 @@ +from sklearn.base import BaseEstimator +from sklearn.utils import check_random_state + + +class SubSampler(BaseEstimator): + + def __init__(self, ratio=.3, random_state=None): + self.ratio = ratio + self.random_state = random_state + self.random_state_ = None + + def transform_pipe(self, X, y=None): + # Awkward situation: random_state_ is set at transform time :) + if self.random_state_ is None: + self.random_state_ = check_random_state(self.random_state) + n_samples, _ = X.shape + random_choice = self.random_state_.random_sample(n_samples) + random_choice = random_choice < self.ratio + X_out = X[random_choice] + y_out = None + if y is not None: + y_out = y[random_choice] + return X_out, y_out + diff --git a/slep002/proposal.rst b/slep002/proposal.rst new file mode 100644 index 0000000..e2f5901 --- /dev/null +++ b/slep002/proposal.rst @@ -0,0 +1,572 @@ +.. _slep_002: + +========================== +SLEP002: Dynamic pipelines +========================== + +.. topic:: **Summary** + + Create and manipulate pipelines with ease. + +.. contents:: Table of contents + :depth: 3 + +Goals +===== + +* Being backward-compatible +* Allow interactive pipeline construction (for example in IPython) +* Support adding and replacing parts of a pipeline +* Support using steps as label (y's) transformers + + +Design +====== + +Imports +------- + +In addition to the `Pipeline <sklearn.pipeline.Pipeline>` class, some additional +wrappers are proposed as part of the public API:: + + from sklearn.pipeline import (Pipeline, fitted, transformer, predictor, + label_transformer, label_predictor, + ignore_transform, ignore_predict) + +Pipeline creation +----------------- + +Backward-compatible +................... + +Of course, the old syntax should be supported:: + + pipe = Pipeline(steps=[('name1', estimator1), ('name2', estimator2)]) + +Proposed default constructor +............................ 
+ +It is not backward-compatible, but it shouldn't break most old code:: + + pipe = Pipeline() + +It is not yet configured, so trying to use it should fail:: + + >>> pipe.predict(...) + Traceback (most recent call last): + ... + NotFittedError: This Pipeline instance is not fitted yet + + >>> pipe.fit(...) + Traceback (most recent call last): + ... + NotConfiguredError: This Pipeline instance is not configured yet + +Proposed construction from iterable of dicts +............................................ + +Dictionaries emphasize structure:: + + pipe = Pipeline( + steps=[ + {'name1': Estimator1()}, + {'name2': Estimator2()}, + ] + ) + +Every dict should be of length 1:: + + >>> pipe = Pipeline( + ... steps=( + ... {'name1': Estimator1(), + ... 'name2': Estimator2()}, + ... {}, + ... ), + ... ) + Traceback (most recent call last): + ... + TypeError: Wrong step definition + + +Proposed construction from ``collections.OrderedDict`` +...................................................... + +It is probably the most natural way to create a pipeline:: + + pipe = Pipeline( + collections.OrderedDict([ + ('name1', Estimator1()), + ('name2', Estimator2()), + ]), + ) + +Backward-compatibility notice +----------------------------- + +As the user can provide an object of any type as the ``steps`` argument to the +constructor, there is no way to be 100% compatible if we are going to maintain +our own type for ``Pipeline.steps``. +But in most cases people provide a ``list`` object as the ``steps`` parameter, so +being backward-compatible with the ``list`` API should be fine. + +Adding estimators +----------------- + +Backward-compatible +................... + +Although not documented, this popular method of modifying (not fitted) pipelines +should be supported:: + + pipe.steps.append(['name', estimator]) + +The only difference is that a special handler is returned instead of ``None``. + +Enhanced: by indexing +..................... + +Using dict-like syntax is very user-friendly:: + + pipe.steps['name'] = estimator + +Enhanced: ``add`` function +.......................... + +An alias to the previous two calls:: + + pipe.steps.add('name', estimator) + +And also:: + + pipe.add_estimator('name', estimator) + +Adding estimators with type specification +......................................... + +Estimator types will be discussed later, but some functions belong to this +section:: + + pipe.add_estimator('name0', estimator0).mark_fitted() + pipe.add_transformer('name1', estimator1) # never calls .predict (x, y -> x) + pipe.add_predictor('name2', estimator2) # never calls .transform (x -> y) + pipe.add_label_transformer('name3', estimator3) # (y -> y) + pipe.add_label_predictor('name4', estimator4) # (y -> y) + +Steps (subestimators) access +---------------------------- + +Backward-compatible +................... + +Indexing by number should return a ``(name, estimator)`` pair:: + + >>> pipe.steps[0] + ('name', SomeEstimator(...)) + +Enhanced access via indexing +............................ + +One should be able to retrieve any estimator by indexing with the step's name:: + + >>> pipe.steps['name'] + SomeEstimator(param1=value1, param2=value2) + +Enhanced access via attributes +.............................. 
+ +Dotted access should also work if the name of the step is a valid Python +identifier and there is no interference with internal methods:: + + >>> pipe.steps.name + SomeEstimator(param1=value1, param2=value2) + + >>> pipe.steps.get + <bound method get of <StepsOrderedDict object at ...>> + + >>> pipe.add_transformer('my transformer', estimator) + >>> pipe.steps.my transformer + File ... + pipe.steps.my transformer + ^ + SyntaxError: invalid syntax + +Replacing estimators +-------------------- + +Backward-compatible +................... + +Replacing should only be supported via access to the ``.steps`` attribute. This way +there is no ambiguity with the new/old subestimator subtype:: + + pipe = Pipeline(steps=[('name', SomeEstimator())]) + pipe.steps[0] = ('name', AnotherEstimator()) + +Replace via indexing by step name +................................. + +Dict-like behavior can be used too:: + + pipe = Pipeline(steps=[('name', SomeEstimator())]) + pipe.steps['name'] = AnotherEstimator() + +Replace via ``replace()`` function +.................................. + +This way one can obtain a special handler:: + + pipe.steps.replace('old_step_name', 'new_step_name', NewEstimator()) + pipe.steps.replace('step_name', 'new_name', + SomeEstimator()).mark_transformer() + + +Rename step via ``rename()`` function +..................................... + +A simple way to change a step's name (doesn't affect anything except the object +representation):: + + pipe.steps.rename('old_name', 'new_name') + +Modifying estimators +-------------------- + +Changing estimator params should only be performed via +``pipeline.set_params()``. If somebody calls ``subestimator.set_params()`` +directly, the pipeline object will have no idea about the changed state. There is no +easy way to control it, so the docs should just warn users about it. + +On the other hand, there exists a not-so-easy way to at least warn users at +runtime: the pipeline will have to keep the params of all its children, compare them +with the actual params during ``fit`` or ``predict`` routines, and raise a warning +if they do not match. This functionality may be implemented as part of some +kind of debugging mode. + +Deleting estimators +------------------- + +Backward-compatible +................... + +The backward-compatible way to delete a step is to ``del`` it via its index number:: + + del pipe.steps[2] + +Enhanced indexing +................. + +A slightly more user-friendly way to remove a step can be achieved +using enhanced indexing:: + + pipe = Pipeline() + est1 = Estimator1() + est2 = Estimator2() + + pipe.steps.add('name1', est1) + pipe.steps.add('name2', est2) + + del pipe.steps['name1'] + del pipe.steps[pipe.steps.index(est2)] + +Using dict/list-like ``pop()`` functions +........................................ + +The last estimator in a chain can be deleted with any of these calls:: + + >>> pipe.steps.pop() + SomeEstimator() + + >>> pipe.steps.popitem() + ('some_name', SomeEstimator()) + +Likewise, the first estimator in the pipeline can be removed with any of these +calls:: + + >>> pipe.steps.popfront() + BeginEstimator() + + >>> pipe.steps.popitemfront() + ('begin', BeginEstimator) + +Any step can be removed with ``pop(step_name)`` or ``popitem(step_name)``. + +Fitted flag reset +----------------- + +Internally, the ``Pipeline`` object should keep track of whether it is fitted or +not. It should consider itself fitted if it wasn't modified after: + +* a successful call to ``.fit``:: + + pipe.fit(...) 
# Got fitted pipeline if no exception was raised + +* construction with a list of estimators, all marked as + fitted via the ``fitted`` function:: + + pipe = pipeline.Pipeline(steps=[ + ('name1', fitted(estimator1)), + ('name2', fitted(estimator2)), + ... + ]) + +* adding a fitted estimator to a fitted pipeline:: + + pipe.steps.append(fitted(estimator1)) + pipe.steps['new_step'] = fitted(estimator2) + pipe.add_transformer('some_key', estimator3).mark_fitted() + +* renaming a step in a fitted pipeline +* removing the first or last step from a fitted pipeline + +Subestimator types +------------------ + +A subestimator type contains information about the way a pipeline +should process a step with that subestimator. + +A subestimator type can be specified: + +* By wrapping the estimator with a subtype constructor call: + * when creating the pipeline:: + + Pipeline([ + ('name1', transformer(estimator)), + ('name2', predictor(estimator)), + ('name3', label_transformer(estimator)), + ('name4', label_predictor(estimator)), + ]) + * when adding or replacing a step:: + + pipe.steps.append(['name', label_predictor(estimator)]) + pipe.steps.add('name', label_transformer(estimator)) + pipe.add_estimator('name', predictor(estimator)) + pipe.steps.replace('name', transformer(fitted(estimator))) + pipe.steps['name'] = fitted(predictor(estimator)) +* Using ``pipe.add_*`` methods:: + + pipe.add_transformer('transformer', Transformer()) + pipe.add_predictor('predictor', Predictor()) + pipe.add_label_transformer('l_transformer', LabelTransformer()) + pipe.add_label_predictor('l_predictor', LabelPredictor()) +* Using special handler methods:: + + pipe.add_estimator('name1', EstimatorA()).mark_transformer() + pipe.steps.add('name2', EstimatorB()).mark_predictor() + pipe.steps.append(['name3', EstimatorC()]).mark_label_transformer() + pipe.steps.replace('name4', EstimatorD()).mark_label_predictor() + pipe.steps.replace('name4', EstimatorE()).mark('label_transformer') + +Transformer +........... +This is the default type. + +It is processed like this:: + + y_new = y + if fitting: + X_new = step_estimator.fit_transform(X, y) + else: + X_new = step_estimator.transform(X) + +Predictor +......... + +It is processed like this:: + + X_new = X + if fitting: + y_new = step_estimator.fit_predict(X, y) + else: + y_new = step_estimator.predict(X) + +Label transformer +................. + +Processing pseudocode:: + + X_new = X + if fitting: + y_new = step_estimator.fit_transform(y) + else: + y_new = step_estimator.transform(y) + +Label predictor +............... + +Processing pseudocode:: + + X_new = X + if fitting: + y_new = step_estimator.fit_predict(y) + else: + y_new = step_estimator.predict(y) 
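+
+Putting the four types together, the fitting pass of the pipeline boils down
+to a dispatch loop over the typed steps. A minimal sketch of that loop,
+assuming each step carries the type mark described above (all names here are
+this proposal's, not an existing API)::
+
+    def fit_steps(steps, X, y):
+        # steps: iterable of (name, estimator, step_type) triples
+        for name, step_estimator, step_type in steps:
+            if step_type == 'transformer':
+                X = step_estimator.fit_transform(X, y)
+            elif step_type == 'predictor':
+                y = step_estimator.fit_predict(X, y)
+            elif step_type == 'label_transformer':
+                y = step_estimator.fit_transform(y)
+            elif step_type == 'label_predictor':
+                y = step_estimator.fit_predict(y)
+        return X, y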
+ +Special handlers and wrapper functions +-------------------------------------- + +Assuming estimator is already fitted +.................................... + +To add an estimator that was already fitted to a pipeline, +one can use the ``fitted`` function:: + + est = SomeEstimator().fit(some_data) + pipe.steps.add('prefitted', fitted(est)) + +or a special handler method:: + + pipe.steps.add('prefitted', est).mark_fitted() + # or + pipe.steps.add('prefitted', est).mark('fitted') + +Ignoring estimator during prediction +.................................... + +In some cases we need to apply an estimator only during the fit phase:: + + pipe.add_estimator('sampler', ignore_transform(Sampler())) + # or + pipe.add_estimator('sampler', Sampler()).mark_ignore_transform() + # or + pipe.add_estimator('sampler', Sampler()).mark('ignore_transform') + +If it is a ``predictor`` or ``label_predictor``, one should use +``ignore_predict``:: + + pipe.add_estimator('cluster', ignore_predict(predictor(ClusteringEstimator()))) + # or + pipe.add_estimator('cluster', predictor(ClusteringEstimator())).mark_ignore_predict() + # or + pipe.add_estimator('cluster', predictor(ClusteringEstimator())).mark('ignore_predict') + +Setting subestimator type +......................... + +As specified above, setting the subestimator type can be performed with a special +handler or a special function call. + +Combining multiple flags +........................ + +All sorts of syntax combinations should be supported:: + + pipe.steps.add('step', fitted(predictor(Estimator()))) + pipe.steps.add('step', predictor(fitted(Estimator()))) + pipe.steps.add('step', predictor(Estimator())).mark_fitted() + pipe.steps.add('step', fitted(Estimator())).mark_predictor() + pipe.steps.add('step', Estimator()).mark_predictor().mark_fitted() + pipe.steps.add('step', Estimator()).mark_fitted().mark_predictor() + pipe.steps.add('step', Estimator()).mark('fitted').mark_predictor() + pipe.steps.add('step', Estimator()).mark('predictor').mark_fitted() + pipe.steps.add('step', Estimator()).mark('predictor').mark('fitted') + pipe.steps.add('step', Estimator()).mark('fitted').mark('predictor') + pipe.steps.add('step', Estimator()).mark('fitted', 'predictor') + pipe.steps.add('step', Estimator()).mark('predictor', 'fitted') + +Type of steps object +-------------------- + +This is an internal type; users shouldn't usually mess with it. +But its public methods should be considered part of the pipeline API. + +Attributes and methods with standard behavior +.............................................. + +Special methods: + +* ``__contains__()``, ``__getitem__()``, ``__setitem__()``, ``__delitem__()`` +* ``__len__()``, ``__iter__()`` +* ``__add__()``, ``__iadd__()`` + +Methods: + +* ``get()``, ``index()`` +* ``extend()``, ``insert()`` +* ``keys()``, ``items()``, ``values()`` +* ``clear()``, ``pop()``, ``popitem()``, ``popfront()``, ``popitemfront()`` + +Non-standard methods +.................... + +* ``replace()`` +* ``rename()`` + +Not supported arguments and methods +................................... + +This type provides dict-like and list-like interfaces, +but the following methods and attributes are not supported: + +* ``fromkeys()`` +* ``setdefault()`` +* ``sort()`` +* ``__mul__()``, ``__rmul__()``, ``__imul__()`` + +Any attempt to use them should fail with ``AttributeError`` or +``NotImplementedError``. + +These methods may not be supported: + +* ``__ge__()``, ``__gt__()`` +* ``__le__()``, ``__lt__()`` + +Serialization +------------- + +* Support loading/unpickling pipelines from old scikit-learn versions +* Keep track of the API version in ``__getstate__`` / the pickler (see the + sketch below): all future versions should support unpickling all previous + versions of the enhanced pipeline +* Serialization of the ``.steps`` attribute (without the master pipeline) may + not be supported. 
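+
+A sketch of how that version tracking could look (the version constant and
+state layout are illustrative assumptions, not a settled format)::
+
+    PIPELINE_STATE_VERSION = 1  # bump on incompatible state changes
+
+    class Pipeline(BaseEstimator):
+        ...
+
+        def __getstate__(self):
+            state = self.__dict__.copy()
+            state['_api_version'] = PIPELINE_STATE_VERSION
+            return state
+
+        def __setstate__(self, state):
+            # Old pickles carry no version key and get the legacy path.
+            version = state.pop('_api_version', 0)
+            if version > PIPELINE_STATE_VERSION:
+                raise ValueError("pickle comes from a newer version")
+            self.__dict__.update(state)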
+ +Examples +======== + +Example: remove outliers +------------------------ + +The proposed design allows many things, but some of them have to be done in +two steps. That shouldn't be a problem though, as one can make a pipeline with +those steps:: + + def make_outlier_remover(bad_value=-1): + outlier_remover = Pipeline() + outlier_remover.steps.add( + 'data', + DropLinesOfXCorrespondingLabel(remove_if=bad_value), + ) + outlier_remover.steps.add( + 'labels', + DropLabelsIf(remove_if=bad_value), + ).mark_label_transformer() + return outlier_remover + +Example: sample dataset +----------------------- +We can use the previous example's function for this:: + + def make_sampler(percent=75): + sentinel = object() + sampler = Pipeline() + sampler.steps.add( + 'sample', + LabelSomeRowsAs(percent=percent, label=sentinel), + ).mark('predictor', 'ignore_predict') + sampler.steps.add( + 'down', + make_outlier_remover(bad_value=sentinel), + ) + return sampler + +Benefits +======== +* Users can use old code with the new pipeline: + the usual ``__init__``, ``set_params``, ``get_params``, ``fit``, ``transform`` + and ``predict`` are the only requirements of subestimators. +* Users can use the new pipeline with their old code: + the pipeline is still a usual estimator that supports the usual set of methods. +* We can finally transform ``y`` in a pipeline. + +Drawbacks +========= +Well, it's a lot of code to write and support... diff --git a/slep003/proposal.rst b/slep003/proposal.rst index de4ed39..d4bf10f 100644 --- a/slep003/proposal.rst +++ b/slep003/proposal.rst @@ -1,16 +1,17 @@ -====================================== -Consistent inspection for transformers -====================================== +.. _slep_003: -. topic:: **Summary** +=============================================== +SLEP003: Consistent inspection for transformers +=============================================== + +.. topic:: **Summary** Inspect transformers' output shape and dependence on input features consistently with ``get_feature_dependence() -> boolean (n_outputs, n_inputs)`` -.. sectnum:: - - :depth: 3 +.. contents:: Table of contents + :depth: 2 Goals ===== diff --git a/slep004/proposal.rst b/slep004/proposal.rst new file mode 100644 index 0000000..195076f --- /dev/null +++ b/slep004/proposal.rst @@ -0,0 +1,411 @@ +.. _slep_004: + +========================= +SLEP004: Data information +========================= + +:Author: Nicolas Hug +:Status: Withdrawn (superseded by :ref:`SLEP006 <slep_006>`) +:Type: Standards Track +:Created: 2018-12-12 + +This is a specification to introduce data information (such as +``sample_weights``) during the computation of an estimator's methods +(``fit``, ``score``, ...), based on the different discussions in +issues and PRs: + +- `Consistent API for attaching properties to samples + #4497 <https://github.com/scikit-learn/scikit-learn/issues/4497>`__ +- `Acceptance of sample\_weights in pipeline.score + #7723 <https://github.com/scikit-learn/scikit-learn/pull/7723>`__ +- `Establish global error state like np.seterr + #4660 <https://github.com/scikit-learn/scikit-learn/issues/4660>`__ +- `Should cross-validation scoring take sample-weights into account? 
+  #4632 <https://github.com/scikit-learn/scikit-learn/issues/4632>`__
+- `Sample properties
+  #4696 <https://github.com/scikit-learn/scikit-learn/issues/4696>`__
+
+Probably related PRs:
+
+- `Add feature\_extraction.ColumnTransformer
+  #3886 <https://github.com/scikit-learn/scikit-learn/pull/3886>`__
+- `Categorical split for decision tree
+  #3346 <https://github.com/scikit-learn/scikit-learn/pull/3346>`__
+
+Google doc of the sample\_prop discussion held during the scikit-learn day in
+Paris on 7 June 2017:
+https://docs.google.com/document/d/1k8d4vyw87gWODiyAyQTz91Z1KOnYr6runx-N074qIBY/edit
+
+.. contents:: Table of contents
+   :depth: 2
+
+1. Requirement
+==============
+
+These requirements are derived from the discussions in the issues and PRs
+above:
+
+- Users can attach information to samples.
+- Must be a DataFrame-like object.
+- Can be given to ``fit``, ``score``, ``split``, and any other method that
+  takes ``X``.
+- Must work with every meta-estimator
+  (``Pipeline, GridSearchCV, cross_val_score``).
+- Can specify what sample property is used by each part of the
+  meta-estimator.
+- Must raise an error if unnecessary extra information is given to an
+  estimator. In the case of meta-estimators these errors are not raised.
+
+Requirement proposed but not adopted by this specification:
+
+- Users can attach feature properties to samples.
+
+2. Definition
+=============
+
+Some estimators in sklearn can change their behavior when an attribute
+``sample_props`` is provided. ``sample_props`` is a dictionary
+(``pandas.DataFrame`` compatible) defining sample properties. The
+example below explains how a ``sample_props`` can be provided to
+``LogisticRegression`` to weight the samples:
+
+.. code:: python
+
+    import numpy as np
+    from sklearn import datasets
+    from sklearn.linear_model import LogisticRegression
+
+    digits = datasets.load_digits()
+    X = digits.data
+    y = digits.target
+
+    # Define weights used by sample_props
+    weights_fit = np.random.rand(X.shape[0])
+    weights_fit /= np.sum(weights_fit)
+    weights_score = np.random.rand(X.shape[0])
+    weights_score /= np.sum(weights_score)
+
+    logreg = LogisticRegression()
+
+    # Fit and score a LogisticRegression without sample weights
+    logreg = logreg.fit(X, y)
+    score = logreg.score(X, y)
+    print("Score obtained without applying weights: %f" % score)
+
+    # Fit LogisticRegression without sample weights and score with sample weights
+    logreg = logreg.fit(X, y)
+    score = logreg.score(X, y, sample_props={'weight': weights_score})
+    print("Score obtained by applying weights only to score: %f" % score)
+
+    # Fit and score a LogisticRegression with sample weights
+    logreg = logreg.fit(X, y, sample_props={'weight': weights_fit})
+    score = logreg.score(X, y, sample_props={'weight': weights_score})
+    print("Score obtained by applying weights to both"
+          " score and fit: %f" % score)
+
+When an estimator expects a mandatory ``sample_props``, an error is
+raised for each property not provided. Moreover, if an unintended
+property is given through ``sample_props``, a warning will be raised
+to signal that the result may differ from the one expected. For
+example, the following code:
+
+..
 code:: python
+
+    import numpy as np
+    from sklearn import datasets
+    from sklearn.linear_model import LogisticRegression
+
+    digits = datasets.load_digits()
+    X = digits.data
+    y = digits.target
+    weights = np.random.rand(X.shape[0])
+
+    logreg = LogisticRegression()
+
+    # This instruction will raise the warning
+    logreg = logreg.fit(X, y, sample_props={'bad_property': weights})
+
+will **raise the warning message**: "sample\_props['bad\_property'] is
+not used by ``LogisticRegression.fit``. The results obtained may be
+different from the ones expected."
+
+We provide the function ``sklearn.seterr`` in case you want to change the
+behavior of these messages. Even though they are considered warnings by
+default, we recommend changing the behavior so that they are raised as
+errors. You can do so by adding the following code:
+
+.. code:: python
+
+    sklearn.seterr(sample_props="raise")
+
+Please refer to the documentation of ``np.seterr`` for more information.
+
+3. Behavior of ``sample_props`` for meta-estimators
+====================================================
+
+3.1 Common routing scheme
+-------------------------
+
+Meta-estimators can also change their behavior when an attribute
+``sample_props`` is provided. In that case, ``sample_props`` will be
+sent to any internal estimator and function supporting the
+``sample_props`` attribute. In other words, all the properties defined by
+``sample_props`` will be transmitted to each internal function or
+class supporting ``sample_props``. In the following example, the
+property ``weights`` is sent through ``sample_props`` to
+``pca.fit_transform`` and ``logistic.fit``:
+
+.. code:: python
+
+    import numpy as np
+    from sklearn import decomposition, datasets, linear_model
+    from sklearn.pipeline import Pipeline
+
+    digits = datasets.load_digits()
+    X = digits.data
+    y = digits.target
+
+    logistic = linear_model.LogisticRegression()
+    pca = decomposition.PCA()
+    pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic),])
+
+    # Define weights
+    weights = np.random.rand(X.shape[0])
+    weights /= np.sum(weights)
+
+    # weights is sent to pca.fit_transform and logistic.fit
+    pipe.fit(X, sample_props={"weights": weights})
+
+By contrast with plain estimators, no warning will be raised by a
+meta-estimator if an extra property is sent through ``sample_props``.
+However, errors are still raised if a mandatory property is not provided.
+
+3.2 Override common routing scheme
+----------------------------------
+
+You can override the common routing scheme of ``sample_props`` for
+nested objects by defining sample properties of the form
+``<component>__<property>``.
+
+You can also override the common routing scheme of ``sample_props`` by
+defining your own routes through the ``routing`` attribute of a
+meta-estimator.
+
+A route defines a way to override the value of a key of
+``sample_props`` with the value of another key in the same
+``sample_props``. This modification is done every time a method
+compatible with ``sample_props`` is called.
+
+To illustrate how this works, if you want to send ``weights`` only to
+``pca``, you can define a ``sample_props`` entry with the property
+``pca__weights``:
+
+..
 code:: python
+
+    import numpy as np
+    from sklearn import decomposition, datasets, linear_model
+    from sklearn.pipeline import Pipeline
+
+    digits = datasets.load_digits()
+    X = digits.data
+    y = digits.target
+
+    logistic = linear_model.LogisticRegression()
+    pca = decomposition.PCA()
+
+    # Create a route using routing
+    pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic),])
+
+    # Define weights
+    weights = np.random.rand(X.shape[0])
+    weights /= np.sum(weights)
+    pca_weights = np.random.rand(X.shape[0])
+    pca_weights /= np.sum(pca_weights)
+
+    # Only pca will receive pca_weights as weights
+    pipe.fit(X, sample_props={'pca__weights': pca_weights})
+
+    # pca will receive pca_weights and logistic will receive weights as weights
+    pipe.fit(X, sample_props={'pca__weights': pca_weights,
+                              'weights': weights})
+
+By defining ``pca__weights``, we have overridden the property
+``weights`` for ``pca``. In all cases, the property ``pca__weights``
+will be sent to ``pca`` and ``logistic``.
+
+Overriding the routing scheme can be subtle, and you must remember the
+priority of application of each route type:
+
+1. Routes applied specifically to a function/estimator:
+   ``{'pca__weights': weights}``
+2. Routes defined globally: ``{'weights': weights}``
+
+Let's consider the following code to familiarize yourself with the
+different route definitions:
+
+.. code:: python
+
+    import numpy as np
+    from sklearn import datasets
+    from sklearn.linear_model import SGDClassifier
+    from sklearn.model_selection import cross_val_score, GridSearchCV, LeaveOneLabelOut
+
+    digits = datasets.load_digits()
+    X = digits.data
+    y = digits.target
+
+    # Define the groups used by cross_val_score
+    cv_groups = np.random.randint(3, size=y.shape)
+
+    # Define the groups used by GridSearchCV
+    gs_groups = np.random.randint(3, size=y.shape)
+
+    # Define weights used by cross_val_score
+    weights = np.random.rand(X.shape[0])
+    weights /= np.sum(weights)
+
+    # Define the parameter grid explored by GridSearchCV
+    params = {'alpha': [0.1, 0.01]}
+
+    # We define the GridSearchCV used by cross_val_score
+    grid = GridSearchCV(SGDClassifier(), params, cv=LeaveOneLabelOut())
+
+    # When cross_val_score is called, we send all parameters for internal values
+    cross_val_score(grid, X, y, cv=LeaveOneLabelOut(),
+                    sample_props={'cv__groups': cv_groups,
+                                  'split__groups': gs_groups,
+                                  'weights': weights})
+
+With this code, the ``sample_props`` sent to each function of
+``GridSearchCV`` and ``cross_val_score`` will be:
+
++-------------------+----------------------------------------------------------+
+| function          | ``sample_props``                                         |
++===================+==========================================================+
+| grid.fit          | ``{'weights': weights, 'cv__groups': cv_groups,``       |
+|                   | ``'split__groups': gs_groups}``                          |
++-------------------+----------------------------------------------------------+
+| grid.score        | ``{'weights': weights, 'cv__groups': cv_groups,``       |
+|                   | ``'split__groups': gs_groups}``                          |
++-------------------+----------------------------------------------------------+
+| grid.split        | ``{'weights': weights, 'groups': gs_groups,``           |
+|                   | ``'cv__groups': cv_groups, 'split__groups': gs_groups}`` |
++-------------------+----------------------------------------------------------+
+| cross\_val\_score | ``{'weights': weights, 'groups': cv_groups,``           |
+|                   | ``'cv__groups': cv_groups, 'split__groups': gs_groups}`` |
++-------------------+----------------------------------------------------------+
+
+Thus, these functions receive the following ``weights`` and ``groups``
+properties:
+
++---------------------+---------------+-----------------+
+| function            | 
``weights``   | ``groups``      |
++=====================+===============+=================+
+| grid.fit            | ``weights``   | ``None``        |
++---------------------+---------------+-----------------+
+| grid.score          | ``weights``   | ``None``        |
++---------------------+---------------+-----------------+
+| grid.split          | ``weights``   | ``gs_groups``   |
++---------------------+---------------+-----------------+
+| cross\_val\_score   | ``weights``   | ``cv_groups``   |
++---------------------+---------------+-----------------+
+
+4. Alternative propositions for sample\_props (06.17.17)
+=========================================================
+
+The meta-estimator says which columns of ``sample_props`` it wants to
+use:
+
+.. code:: python
+
+    p = make_pipeline(
+        PCA(n_components=10),
+        SVC(C=10).with(<method>_<thing_the_method_knows>=<column_name>)
+    )
+    p.fit(X, y, sample_props={<column_name>: <value>})
+
+For example:
+
+.. code:: python
+
+    p = make_pipeline(
+        PCA(n_components=10),
+        SVC(C=10).with(fit_weights='weights', score_weights='weights')
+    )
+    p.fit(X, y, sample_props={"weights": w})
+
+**Other proposals**:
+
+- Olivier suggests replacing ``.with(...)`` with ``.sample_props_mapping(...)``.
+- Gael suggests replacing the ``.with(...)`` call with a property
+  ``with_props=...``, like:
+
+.. code:: python
+
+    p = make_pipeline(
+        PCA(n_components=10),
+        SVC(C=10),
+        with_props={
+            'svc': (<method>_<thing_the_method_knows>=<column_name>)}
+    )
+
+4.1 GridSearch + Pipeline case
+------------------------------
+
+Let's consider the case of a ``GridSearch`` working with a ``Pipeline``.
+How do we define the ``sample_props`` in that case?
+
+Alternative 1
+~~~~~~~~~~~~~
+
+Pass through everything in ``GridSearchCV``:
+
+.. code:: python
+
+    pipe = make_pipeline(
+        PCA(), SVC(),
+        with_props={'pca__fit_weight': 'my_weights'})
+    GridSearchCV(
+        pipe, cv=my_cv,
+        with_props={'cv__groups': "my_groups", '*': '*'})
+
+A more complex example with this solution:
+
+.. code:: python
+
+    pipe = make_pipeline(
+        make_union(
+            CountVectorizer(analyzer='word').with(fit_weight='my_weight'),
+            CountVectorizer(analyzer='char').with(fit_weight='my_weight')),
+        SVC())
+
+    GridSearchCV(
+        pipe,
+        cv=my_cv.with(groups='my_groups'), score_weight='my_weight')
+
+Alternative 2
+~~~~~~~~~~~~~
+
+Grid search manages the ``sample_props`` of all internal variables.
+
+.. code:: python
+
+    pipe = make_pipeline(PCA(), SVC())
+    GridSearchCV(
+        pipe, cv=my_cv,
+        with_props={
+            'cv__groups': "my_groups",
+            'estimator__pca__fit_weight': "my_weights",
+        })
+
+A more complex example with this solution:
+
+..
 code:: python
+
+    pipe = make_pipeline(
+        make_union(
+            CountVectorizer(analyzer='word'),
+            CountVectorizer(analyzer='char')),
+        SVC())
+    GridSearchCV(
+        pipe, cv=my_cv,
+        with_props={
+            'cv__groups': "my_groups",
+            'estimator__featureunion__countvectorizer-1__fit_weight': "my_weights",
+            'estimator__featureunion__countvectorizer-2__fit_weight': "my_weights",
+            'score_weight': "my_weights",
+        }
+    )
diff --git a/slep006/cases_opt0a.py b/slep006/cases_opt0a.py
new file mode 100644
index 0000000..96a3206
--- /dev/null
+++ b/slep006/cases_opt0a.py
@@ -0,0 +1,97 @@
+import numpy as np
+
+from defs import (GroupKFold, get_scorer, SelectKBest,
+                  LogisticRegressionCV, cross_validate,
+                  make_pipeline, X, y, my_groups, my_weights)
+
+# %%
+# Case A: weighted scoring and fitting
+
+
+GROUPS_IDX = -1
+WEIGHT_IDX = -2
+
+
+def unwrap_X(X):
+    # Drop the appended sample-prop columns, keeping only the real features.
+    return X[:, :-2]
+
+
+class WrappedGroupCV:
+    def __init__(self, base_cv, groups_idx=GROUPS_IDX):
+        self.base_cv = base_cv
+        self.groups_idx = groups_idx
+
+    def split(self, X, y, groups=None):
+        groups = X[:, self.groups_idx]
+        return self.base_cv.split(unwrap_X(X), y, groups=groups)
+
+    def get_n_splits(self, X, y, groups=None):
+        groups = X[:, self.groups_idx]
+        return self.base_cv.get_n_splits(unwrap_X(X), y, groups=groups)
+
+
+wrapped_group_cv = WrappedGroupCV(GroupKFold())
+
+
+class WrappedLogisticRegressionCV(LogisticRegressionCV):
+    def fit(self, X, y):
+        return super().fit(unwrap_X(X), y, sample_weight=X[:, WEIGHT_IDX])
+
+
+acc_scorer = get_scorer('accuracy')
+
+
+def wrapped_weighted_acc(est, X, y, sample_weight=None):
+    return acc_scorer(est, unwrap_X(X), y, sample_weight=X[:, WEIGHT_IDX])
+
+
+lr = WrappedLogisticRegressionCV(
+    cv=wrapped_group_cv,
+    scoring=wrapped_weighted_acc,
+).set_props_request(['sample_weight'])
+# column_stack keeps the 1-d prop arrays as columns appended to X
+cross_validate(lr, np.column_stack([X, my_weights, my_groups]), y,
+               cv=wrapped_group_cv,
+               scoring=wrapped_weighted_acc)
+
+# %%
+# Case B: weighted scoring and unweighted fitting
+
+class UnweightedWrappedLogisticRegressionCV(LogisticRegressionCV):
+    def fit(self, X, y):
+        return super().fit(unwrap_X(X), y)
+
+
+lr = UnweightedWrappedLogisticRegressionCV(
+    cv=wrapped_group_cv,
+    scoring=wrapped_weighted_acc,
+).set_props_request(['sample_weight'])
+cross_validate(lr, np.column_stack([X, my_weights, my_groups]), y,
+               cv=wrapped_group_cv,
+               scoring=wrapped_weighted_acc)
+
+
+# %%
+# Case C: unweighted feature selection
+
+class UnweightedWrappedSelectKBest(SelectKBest):
+    def fit(self, X, y):
+        return super().fit(unwrap_X(X), y)
+
+
+lr = WrappedLogisticRegressionCV(
+    cv=wrapped_group_cv,
+    scoring=wrapped_weighted_acc,
+).set_props_request(['sample_weight'])
+sel = UnweightedWrappedSelectKBest()
+pipe = make_pipeline(sel, lr)
+cross_validate(pipe, np.column_stack([X, my_weights, my_groups]), y,
+               cv=wrapped_group_cv,
+               scoring=wrapped_weighted_acc)
+
+# %%
+# Case D: different scoring and fitting weights
+
+SCORING_WEIGHT_IDX = -3
+
+# TODO: proceed from here. Note that this change implies the need to add
+# a parameter to unwrap_X, since we will now append an additional column to X.
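+
+# What follows is an illustrative sketch, not part of the original file: one
+# way Case D could be completed, assuming the scoring weights travel as a
+# third appended column, with the unwrapping adjusted as the TODO above
+# suggests. The names `unwrap_X3`, `WrappedGroupCV3`,
+# `ScoringWeightedLogisticRegressionCV` and `scoring_weighted_acc` are
+# hypothetical.
+
+from defs import my_other_weights  # extra import needed for this sketch
+
+
+def unwrap_X3(X):
+    # Drop the three appended sample-prop columns.
+    return X[:, :-3]
+
+
+class WrappedGroupCV3(WrappedGroupCV):
+    def split(self, X, y, groups=None):
+        groups = X[:, self.groups_idx]
+        return self.base_cv.split(unwrap_X3(X), y, groups=groups)
+
+    def get_n_splits(self, X, y, groups=None):
+        groups = X[:, self.groups_idx]
+        return self.base_cv.get_n_splits(unwrap_X3(X), y, groups=groups)
+
+
+class ScoringWeightedLogisticRegressionCV(LogisticRegressionCV):
+    def fit(self, X, y):
+        # Fit with the fitting-weight column only.
+        return super().fit(unwrap_X3(X), y, sample_weight=X[:, WEIGHT_IDX])
+
+
+def scoring_weighted_acc(est, X, y, sample_weight=None):
+    # Score with the dedicated scoring-weight column.
+    return acc_scorer(est, unwrap_X3(X), y,
+                      sample_weight=X[:, SCORING_WEIGHT_IDX])
+
+
+lr = ScoringWeightedLogisticRegressionCV(
+    cv=WrappedGroupCV3(GroupKFold()),
+    scoring=scoring_weighted_acc,
+)
+# Column layout: [X, scoring weights (-3), fitting weights (-2), groups (-1)].
+cross_validate(lr,
+               np.column_stack([X, my_weights, my_other_weights, my_groups]),
+               y,
+               cv=WrappedGroupCV3(GroupKFold()),
+               scoring=scoring_weighted_acc)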
diff --git a/slep006/cases_opt0b.py b/slep006/cases_opt0b.py
new file mode 100644
index 0000000..89ae365
--- /dev/null
+++ b/slep006/cases_opt0b.py
@@ -0,0 +1,91 @@
+import pandas as pd
+
+from defs import (GroupKFold, get_scorer, SelectKBest,
+                  LogisticRegressionCV, cross_validate,
+                  make_pipeline, X, y, my_groups, my_weights,
+                  my_other_weights)
+
+X = pd.DataFrame(X)
+MY_GROUPS = pd.Series(my_groups)
+MY_WEIGHTS = pd.Series(my_weights)
+MY_OTHER_WEIGHTS = pd.Series(my_other_weights)
+
+# %%
+# Case A: weighted scoring and fitting
+
+
+class WrappedGroupCV:
+    def __init__(self, base_cv):
+        self.base_cv = base_cv
+
+    def split(self, X, y, groups=None):
+        return self.base_cv.split(X, y, groups=MY_GROUPS.loc[X.index])
+
+    def get_n_splits(self, X, y, groups=None):
+        return self.base_cv.get_n_splits(X, y, groups=MY_GROUPS.loc[X.index])
+
+
+wrapped_group_cv = WrappedGroupCV(GroupKFold())
+
+
+class WeightedLogisticRegressionCV(LogisticRegressionCV):
+    def fit(self, X, y):
+        return super().fit(X, y, sample_weight=MY_WEIGHTS.loc[X.index])
+
+
+acc_scorer = get_scorer('accuracy')
+
+
+def wrapped_weighted_acc(est, X, y, sample_weight=None):
+    return acc_scorer(est, X, y, sample_weight=MY_WEIGHTS.loc[X.index])
+
+
+lr = WeightedLogisticRegressionCV(
+    cv=wrapped_group_cv,
+    scoring=wrapped_weighted_acc,
+).set_props_request(['sample_weight'])
+cross_validate(lr, X, y,
+               cv=wrapped_group_cv,
+               scoring=wrapped_weighted_acc)
+
+# %%
+# Case B: weighted scoring and unweighted fitting
+
+lr = LogisticRegressionCV(
+    cv=wrapped_group_cv,
+    scoring=wrapped_weighted_acc,
+).set_props_request(['sample_weight'])
+cross_validate(lr, X, y,
+               cv=wrapped_group_cv,
+               scoring=wrapped_weighted_acc)
+
+
+# %%
+# Case C: unweighted feature selection
+
+lr = WeightedLogisticRegressionCV(
+    cv=wrapped_group_cv,
+    scoring=wrapped_weighted_acc,
+).set_props_request(['sample_weight'])
+sel = SelectKBest()
+pipe = make_pipeline(sel, lr)
+cross_validate(pipe, X, y,
+               cv=wrapped_group_cv,
+               scoring=wrapped_weighted_acc)
+
+# %%
+# Case D: different scoring and fitting weights
+
+
+def other_weighted_acc(est, X, y, sample_weight=None):
+    return acc_scorer(est, X, y, sample_weight=MY_OTHER_WEIGHTS.loc[X.index])
+
+
+lr = WeightedLogisticRegressionCV(
+    cv=wrapped_group_cv,
+    scoring=other_weighted_acc,
+).set_props_request(['sample_weight'])
+sel = SelectKBest()
+pipe = make_pipeline(sel, lr)
+cross_validate(pipe, X, y,
+               cv=wrapped_group_cv,
+               scoring=other_weighted_acc)
diff --git a/slep006/cases_opt1.py b/slep006/cases_opt1.py
new file mode 100644
index 0000000..94351df
--- /dev/null
+++ b/slep006/cases_opt1.py
@@ -0,0 +1,68 @@
+from defs import (accuracy_score, GroupKFold, make_scorer, SelectKBest,
+                  LogisticRegressionCV, cross_validate, make_pipeline, X, y,
+                  my_groups, my_weights, my_other_weights)
+
+# %%
+# Case A: weighted scoring and fitting
+
+lr = LogisticRegressionCV(
+    cv=GroupKFold(),
+    scoring='accuracy',
+)
+cross_validate(lr, X, y, cv=GroupKFold(),
+               props={'sample_weight': my_weights, 'groups': my_groups},
+               scoring='accuracy')
+
+# Error handling: if props={'sample_eight': my_weights, ...} was passed
+# instead, the estimator would fit and score without weight, silently failing.
+
+# %%
+# Case B: weighted scoring and unweighted fitting
+
+
+class MyLogisticRegressionCV(LogisticRegressionCV):
+    def fit(self, X, y, props=None):
+        props = {} if props is None else props.copy()
+        props.pop('sample_weight', None)
+        return super().fit(X, y, props=props)
+
+
+# %%
+# Case C: unweighted feature selection
+
+# Currently feature selection does not handle sample_weight, and as long as
+# that remains the case, it will simply ignore the prop passed to it. Hence:
+
+lr = LogisticRegressionCV(
+    cv=GroupKFold(),
+    scoring='accuracy',
+)
+sel = SelectKBest()
+pipe = make_pipeline(sel, lr)
+cross_validate(pipe, X, y, cv=GroupKFold(),
+               props={'sample_weight': my_weights, 'groups': my_groups},
+               scoring='accuracy')
+
+# %%
+# Case D: different scoring and fitting weights
+
+weighted_acc = make_scorer(accuracy_score)
+
+
+def specially_weighted_acc(est, X, y, props):
+    props = props.copy()
+    # Locally interpret 'scoring_weight' as the scorer's 'sample_weight'.
+    props['sample_weight'] = props.pop('scoring_weight')
+    return weighted_acc(est, X, y, props)
+
+
+lr = LogisticRegressionCV(
+    cv=GroupKFold(),
+    scoring=specially_weighted_acc,
+)
+cross_validate(lr, X, y, cv=GroupKFold(),
+               props={
+                   'scoring_weight': my_weights,
+                   'sample_weight': my_other_weights,
+                   'groups': my_groups,
+               },
+               scoring=specially_weighted_acc)
diff --git a/slep006/cases_opt2.py b/slep006/cases_opt2.py
new file mode 100644
index 0000000..5c63d3d
--- /dev/null
+++ b/slep006/cases_opt2.py
@@ -0,0 +1,70 @@
+from defs import (GroupKFold, SelectKBest, LogisticRegressionCV,
+                  cross_validate, make_pipeline, X, y, my_groups,
+                  my_weights, my_other_weights)
+
+# %%
+# Case A: weighted scoring and fitting
+
+lr = LogisticRegressionCV(
+    cv=GroupKFold(),
+    scoring='accuracy',
+)
+props = {'cv__groups': my_groups,
+         'estimator__cv__groups': my_groups,
+         'estimator__sample_weight': my_weights,
+         'scoring__sample_weight': my_weights,
+         'estimator__scoring__sample_weight': my_weights}
+cross_validate(lr, X, y, cv=GroupKFold(),
+               props=props,
+               scoring='accuracy')
+
+# error handling: if props={'estimator__sample_eight': my_weights, ...} was
+# passed instead, the estimator would raise an error.
+ +# %% +# Case B: weighted scoring and unweighted fitting + +lr = LogisticRegressionCV( + cv=GroupKFold(), + scoring='accuracy', +) +props = {'cv__groups': my_groups, + 'estimator__cv__groups': my_groups, + 'scoring__sample_weight': my_weights, + 'estimator__scoring__sample_weight': my_weights} +cross_validate(lr, X, y, cv=GroupKFold(), + props=props, + scoring='accuracy') + +# %% +# Case C: unweighted feature selection + +lr = LogisticRegressionCV( + cv=GroupKFold(), + scoring='accuracy', +) +pipe = make_pipeline(SelectKBest(), lr) +props = {'cv__groups': my_groups, + 'estimator__logisticregressioncv__cv__groups': my_groups, + 'estimator__logisticregressioncv__sample_weight': my_weights, + 'scoring__sample_weight': my_weights, + 'estimator__scoring__sample_weight': my_weights} +cross_validate(pipe, X, y, cv=GroupKFold(), + props=props, + scoring='accuracy') + +# %% +# Case D: different scoring and fitting weights + +lr = LogisticRegressionCV( + cv=GroupKFold(), + scoring='accuracy', +) +props = {'cv__groups': my_groups, + 'estimator__cv__groups': my_groups, + 'estimator__sample_weight': my_other_weights, + 'scoring__sample_weight': my_weights, + 'estimator__scoring__sample_weight': my_weights} +cross_validate(lr, X, y, cv=GroupKFold(), + props=props, + scoring='accuracy') diff --git a/slep006/cases_opt3.py b/slep006/cases_opt3.py new file mode 100644 index 0000000..fff317d --- /dev/null +++ b/slep006/cases_opt3.py @@ -0,0 +1,99 @@ +from defs import (SelectKBest, LogisticRegressionCV, + GroupKFold, cross_validate, make_pipeline, X, y, my_groups, + my_weights, my_other_weights) + +# %% +# Case A: weighted scoring and fitting + +lr = LogisticRegressionCV( + cv=GroupKFold(), + scoring='accuracy', + prop_routing={'cv': ['groups'], + 'scoring': ['sample_weight'], + } + # one question here is whether we need to explicitly route sample_weight + # to LogisticRegressionCV's fitting... +) + +# Alternative syntax, which assumes cv receives 'groups' by default, and that a +# method-based API is provided on meta-estimators: +# lr = LogisticRegressionCV( +# cv=GroupKFold(), +# scoring='accuracy', +# ).add_prop_route(scoring='sample_weight') + +cross_validate(lr, X, y, cv=GroupKFold(), + props={'sample_weight': my_weights, 'groups': my_groups}, + scoring='accuracy', + prop_routing={'estimator': '*', # pass all props + 'cv': ['groups'], + 'scoring': ['sample_weight'], + }) + +# Error handling: if props={'sample_eight': my_weights, ...} was passed +# instead, LogisticRegressionCV would have to identify that a key was passed +# that could not be routed nor used, in order to raise an error. + +# %% +# Case B: weighted scoring and unweighted fitting + +# Here we rename the sample_weight prop so that we can specify that it only +# applies to scoring. +lr = LogisticRegressionCV( + cv=GroupKFold(), + scoring='accuracy', + prop_routing={'cv': ['groups'], + # read the following as "scoring should consume + # 'scoring_weight' as if it were 'sample_weight'." 
+                  'scoring': {'sample_weight': 'scoring_weight'},
+                  },
+)
+cross_validate(lr, X, y, cv=GroupKFold(),
+               props={'scoring_weight': my_weights, 'groups': my_groups},
+               scoring='accuracy',
+               prop_routing={'estimator': '*',
+                             'cv': ['groups'],
+                             'scoring': {'sample_weight': 'scoring_weight'},
+                             })
+
+# %%
+# Case C: unweighted feature selection
+
+lr = LogisticRegressionCV(
+    cv=GroupKFold(),
+    scoring='accuracy',
+    prop_routing={'cv': ['groups'],
+                  'scoring': ['sample_weight'],
+                  })
+pipe = make_pipeline(SelectKBest(), lr,
+                     prop_routing={'logisticregressioncv': ['sample_weight',
+                                                            'groups']})
+cross_validate(pipe, X, y, cv=GroupKFold(),
+               props={'sample_weight': my_weights, 'groups': my_groups},
+               scoring='accuracy',
+               prop_routing={'estimator': '*',
+                             'cv': ['groups'],
+                             'scoring': ['sample_weight'],
+                             })
+
+# %%
+# Case D: different scoring and fitting weights
+lr = LogisticRegressionCV(
+    cv=GroupKFold(),
+    scoring='accuracy',
+    prop_routing={'cv': ['groups'],
+                  # read the following as "scoring should consume
+                  # 'scoring_weight' as if it were 'sample_weight'."
+                  'scoring': {'sample_weight': 'scoring_weight'},
+                  },
+)
+cross_validate(lr, X, y, cv=GroupKFold(),
+               props={'scoring_weight': my_weights, 'groups': my_groups,
+                      'fitting_weight': my_other_weights},
+               scoring='accuracy',
+               prop_routing={'estimator': {'sample_weight': 'fitting_weight',
+                                           'scoring_weight': 'scoring_weight',
+                                           'groups': 'groups'},
+                             'cv': ['groups'],
+                             'scoring': {'sample_weight': 'scoring_weight'},
+                             })
diff --git a/slep006/cases_opt4.py b/slep006/cases_opt4.py
new file mode 100644
index 0000000..84c8633
--- /dev/null
+++ b/slep006/cases_opt4.py
@@ -0,0 +1,78 @@
+from defs import (accuracy_score, GroupKFold, make_scorer, SelectKBest,
+                  LogisticRegressionCV, cross_validate,
+                  make_pipeline, X, y, my_groups, my_weights,
+                  my_other_weights)
+
+# %%
+# Case A: weighted scoring and fitting
+
+# Here we presume that GroupKFold requests `groups` by default.
+# We need to explicitly request weights in make_scorer and for
+# LogisticRegressionCV. Both of these consumers understand the meaning
+# of the key "sample_weight".
+
+weighted_acc = make_scorer(accuracy_score, request_props=['sample_weight'])
+lr = LogisticRegressionCV(
+    cv=GroupKFold(),
+    scoring=weighted_acc,
+).set_props_request(['sample_weight'])
+cross_validate(lr, X, y, cv=GroupKFold(),
+               props={'sample_weight': my_weights, 'groups': my_groups},
+               scoring=weighted_acc)
+
+# Error handling: if props={'sample_eight': my_weights, ...} was passed,
+# cross_validate would raise an error, since 'sample_eight' was not requested
+# by any of its children.
+
+# %%
+# Case B: weighted scoring and unweighted fitting
+
+# Since LogisticRegressionCV requires that weights explicitly be requested,
+# removing that request means the fitting is unweighted.
+
+weighted_acc = make_scorer(accuracy_score, request_props=['sample_weight'])
+lr = LogisticRegressionCV(
+    cv=GroupKFold(),
+    scoring=weighted_acc,
+)
+cross_validate(lr, X, y, cv=GroupKFold(),
+               props={'sample_weight': my_weights, 'groups': my_groups},
+               scoring=weighted_acc)
+
+# %%
+# Case C: unweighted feature selection
+
+# Like LogisticRegressionCV, SelectKBest needs to request weights explicitly.
+# Here it does not request them.
+ +weighted_acc = make_scorer(accuracy_score, request_props=['sample_weight']) +lr = LogisticRegressionCV( + cv=GroupKFold(), + scoring=weighted_acc, +).set_props_request(['sample_weight']) +sel = SelectKBest() +pipe = make_pipeline(sel, lr) +cross_validate(pipe, X, y, cv=GroupKFold(), + props={'sample_weight': my_weights, 'groups': my_groups}, + scoring=weighted_acc) + +# %% +# Case D: different scoring and fitting weights + +# Despite make_scorer and LogisticRegressionCV both expecting a key +# sample_weight, we can use aliases to pass different weights to different +# consumers. + +weighted_acc = make_scorer(accuracy_score, + request_props={'scoring_weight': 'sample_weight'}) +lr = LogisticRegressionCV( + cv=GroupKFold(), + scoring=weighted_acc, +).set_props_request({'fitting_weight': "sample_weight"}) +cross_validate(lr, X, y, cv=GroupKFold(), + props={ + 'scoring_weight': my_weights, + 'fitting_weight': my_other_weights, + 'groups': my_groups, + }, + scoring=weighted_acc) diff --git a/slep006/cases_opt4b.py b/slep006/cases_opt4b.py new file mode 100644 index 0000000..5f8fd4d --- /dev/null +++ b/slep006/cases_opt4b.py @@ -0,0 +1,95 @@ +from defs import (accuracy_score, GroupKFold, make_scorer, SelectKBest, + LogisticRegressionCV, cross_validate, + make_pipeline, X, y, my_groups, my_weights, + my_other_weights) + +# %% +# Case A: weighted scoring and fitting + +# Here we presume that GroupKFold requests `groups` by default. +# We need to explicitly request weights in make_scorer and for +# LogisticRegressionCV. Both of these consumers understand the meaning +# of the key "sample_weight". + +weighted_acc = make_scorer(accuracy_score, request_metadata=['sample_weight']) +group_cv = GroupKFold() +lr = LogisticRegressionCV( + cv=group_cv, + scoring=weighted_acc, +).request_sample_weight(fit=True) # same as `fit=['sample_weight']` +cross_validate(lr, X, y, cv=group_cv, + metadata={'sample_weight': my_weights, 'groups': my_groups}, + scoring=weighted_acc) + +# Here lr.get_metadata_request() would return +# {'fit': {'groups': {'groups'}, 'sample_weight': {'sample_weight'}}, +# 'predict': {}, +# 'transform': {}, +# 'score': {}, +# 'split': {}, +# 'inverse_transform': {}} + +# Error handling: if metadata={'sample_eight': my_weights, ...} was passed, +# cross_validate would raise an error, since 'sample_eight' was not requested +# by any of its children. + +# %% +# Case B: weighted scoring and unweighted fitting + +# Since LogisticRegressionCV requires that weights explicitly be requested, +# removing that request means the fitting is unweighted. + +weighted_acc = make_scorer(accuracy_score, request_metadata=['sample_weight']) +lr = LogisticRegressionCV( + cv=group_cv, + scoring=weighted_acc, +).request_sample_weight(fit=False) # if not specified an exception is raised +cross_validate(lr, X, y, cv=group_cv, + metadata={'sample_weight': my_weights, 'groups': my_groups}, + scoring=weighted_acc) + +# Here lr.get_metadata_request() would return +# {'fit': {'groups': {'groups'}}, +# 'predict': {}, +# 'transform': {}, +# 'score': {}, +# 'split': {}, +# 'inverse_transform': {}} + +# %% +# Case C: unweighted feature selection + +# Like LogisticRegressionCV, SelectKBest needs to request weights explicitly. +# Here it does not request them. 
+
+weighted_acc = make_scorer(accuracy_score, request_metadata=['sample_weight'])
+lr = LogisticRegressionCV(
+    cv=group_cv,
+    scoring=weighted_acc,
+).request_sample_weight(fit=True)
+sel = SelectKBest().request_sample_weight(fit=False)
+pipe = make_pipeline(sel, lr)
+cross_validate(pipe, X, y, cv=group_cv,
+               metadata={'sample_weight': my_weights, 'groups': my_groups},
+               scoring=weighted_acc)
+
+# %%
+# Case D: different scoring and fitting weights
+
+# Despite make_scorer and LogisticRegressionCV both expecting a key
+# sample_weight, we can use aliases to pass different weights to different
+# consumers.
+
+weighted_acc = make_scorer(accuracy_score,
+                           request_metadata={'scoring_weight': 'sample_weight'})
+lr = LogisticRegressionCV(
+    cv=group_cv,
+    scoring=weighted_acc,
+).request_sample_weight(fit='fitting_weight')
+cross_validate(lr, X, y, cv=group_cv,
+               metadata={
+                   'scoring_weight': my_weights,
+                   'fitting_weight': my_other_weights,
+                   'groups': my_groups,
+               },
+               scoring=weighted_acc)
diff --git a/slep006/defs.py b/slep006/defs.py
new file mode 100644
index 0000000..26c1d6a
--- /dev/null
+++ b/slep006/defs.py
@@ -0,0 +1,14 @@
+import numpy as np
+from sklearn.feature_selection import SelectKBest
+from sklearn.linear_model import LogisticRegressionCV
+from sklearn.metrics import accuracy_score
+from sklearn.metrics import get_scorer  # re-exported for cases_opt0a/0b
+from sklearn.metrics import make_scorer
+from sklearn.model_selection import GroupKFold, cross_validate
+from sklearn.pipeline import make_pipeline
+
+N, M = 100, 4
+X = np.random.rand(N, M)
+y = np.random.randint(0, 2, size=N)
+my_groups = np.random.randint(0, 10, size=N)
+my_weights = np.random.rand(N)
+my_other_weights = np.random.rand(N)
diff --git a/slep006/other.rst b/slep006/other.rst
new file mode 100644
index 0000000..552e289
--- /dev/null
+++ b/slep006/other.rst
@@ -0,0 +1,160 @@
+:orphan:
+
+.. _slep_006_other:
+
+Alternative solutions to sample-aligned meta-data
+=================================================
+
+This page contains alternative solutions that were discussed
+and ultimately not adopted in the SLEP.
+
+Solution sketches require these definitions:
+
+.. literalinclude:: defs.py
+
+Status quo solution 0a: additional feature
+------------------------------------------
+
+Without changing scikit-learn, the following hack can be used:
+
+Additional numeric features representing sample props can be appended to the
+data and passed around, being handled specially in each consumer of features
+or sample props.
+
+.. literalinclude:: cases_opt0a.py
+
+Status quo solution 0b: Pandas Index and global resources
+---------------------------------------------------------
+
+Without changing scikit-learn, the following hack can be used:
+
+If `y` is represented with a Pandas datatype, then its index can be used to
+access required elements from props stored in a global namespace (or otherwise
+made available to the estimator before fitting). This is possible everywhere
+that a ground-truth `y` is passed, including fit, split, score, and metrics.
+A similar solution with `X` is also possible (except for metrics), if all
+Pipeline components retain the original Pandas Index.
+
+Issues:
+
+* use of a global data source
+* requires Pandas data types and indices to be maintained
+
+.. literalinclude:: cases_opt0b.py
+
+Solution 1: Pass everything
+---------------------------
+
+This proposal passes all props to all consumers (estimators, splitters,
+scorers, etc.). The consumer would optionally use props it is familiar with by
+name and disregard other props.
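+
+As a rough illustration (a sketch, not part of the original proposal text), a
+consumer under this solution might look like the following, where every name
+is hypothetical::
+
+    class MyClassifier(BaseEstimator):
+        def fit(self, X, y, **props):
+            # Use the props this estimator knows about...
+            sample_weight = props.get('sample_weight')
+            # ...and silently disregard everything else in `props`.
+            ...
+            return self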
+ +We may consider providing syntax for the user to control the interpretation of +incoming props: + +* to require that some prop is provided (for an estimator where that prop is + otherwise optional) +* to disregard some provided prop +* to treat a particular prop key as having a certain meaning (e.g. locally + interpreting 'scoring_sample_weight' as 'sample_weight'). + +These constraints would be checked by calling a helper at the consumer. + +Issues: + +* Error handling: if a key is optional in a consumer, no error will be + raised for misspelling. An introspection API might change this, allowing a + user or meta-estimator to check if all keys passed are to be used in at least + one consumer. +* Forwards compatibility: newly supporting a prop key in a consumer will change + behaviour. Other than a ChangedBehaviorWarning, I don't see any way around + this. +* Introspection: not inherently supported. Would need an API like + ``get_prop_support(names: List[str]) -> Dict[str, Literal["supported", "required", "ignored"]]``. + +In short, this is a simple solution, but prone to risk. + +.. literalinclude:: cases_opt1.py + + +Solution 2: Specify routes at call +---------------------------------- + +Similar to the legacy behavior of fit parameters in +:class:`sklearn.pipeline.Pipeline`, this requires the user to specify the +path for each "prop" to follow when calling `fit`. For example, to pass +a prop named 'weights' to a step named 'spam' in a Pipeline, you might use +`my_pipe.fit(X, y, props={'spam__weights': my_weights})`. + +SLEP004's syntax to override the common routing scheme falls under this +solution. + +Advantages: + +* Very explicit and robust to misspellings. + +Issues: + +* The user needs to know the nested internal structure, or it is easy to fail + to pass a prop to a specific estimator. +* A corollary is that prop keys need changing when the developer modifies their + estimator structure (see case C). +* This gets especially tricky or impossible where the available routes + change mid-fit, such as where a grid search considers estimators with + different structures. +* We would need to find a different solution for :issue:`2630` where a Pipeline + could not be the base estimator of AdaBoost because AdaBoost expects the base + estimator to accept a fit param keyed 'sample_weight'. +* This may not work if a meta-estimator were to have the role of changing a + prop, e.g. a meta-estimator that passes `sample_weight` corresponding to + balanced classes onto its base estimator. The meta-estimator would need a + list of destinations to pass modified props to, or a list of keys to modify. +* We would need to develop naming conventions for different routes, which may + be more complicated than the current conventions; while a GridSearchCV + wrapping a Pipeline currently takes parameters with keys like + `{step_name}__{prop_name}`, this explicit routing, and conflict with + GridSearchCV routing destinations, implies keys like + `estimator__{step_name}__{prop_name}`. + +.. literalinclude:: cases_opt2.py + + +Solution 3: Specify routes on metaestimators +-------------------------------------------- + +Each meta-estimator is given a routing specification which it must follow in +passing only the required parameters to each of its children. In this context, +a GridSearchCV has children including `estimator`, `cv` and (each element of) +`scoring`. + +Pull request :pr:`9566` and its extension in :pr:`15425` are partial +implementations of this approach. 
+ +A major benefit of this approach is that it may allow only prop routing +meta-estimators to be modified, not prop consumers. + +All consumers would be required to check that + +Issues: + +* Routing may be hard to get one's head around, especially since the prop + support belongs to the child estimator but the parent is responsible for the + routing. +* Need to design an API for specifying routings. +* As in Solution 2, each local destination for routing props needs to be given + a name. +* Every router along the route will need consistent instructions to pass a + specific prop to a consumer. If the prop is optional in the consumer, routing + failures may be hard to identify and debug. +* For estimators to be cloned, this routing information needs to be cloned with + it. This implies one of: the routing information be stored as a constructor + parameter; or `clone` is extended to explicitly copy routing information. + +Possible public syntax: + +Each meta-estimator has a `prop_routing` parameter to encode local routing +rules, and a set of named children which it routes to. In :pr:`9566`, the +`prop_routing` entry for each child may be a white list or black list of +named keys passed to the meta-estimator. + +.. literalinclude:: cases_opt3.py diff --git a/slep006/proposal.rst b/slep006/proposal.rst new file mode 100644 index 0000000..ddc7c34 --- /dev/null +++ b/slep006/proposal.rst @@ -0,0 +1,282 @@ +.. _slep_006: + +========================= +SLEP006: Metadata Routing +========================= + +:Author: Joel Nothman, Adrin Jalali, Alex Gramfort, Thomas J. Fan +:Status: Accepted +:Type: Standards Track +:Created: 2019-03-07 + +Abstract +-------- + +This SLEP proposes an API to configure estimators, scorers, and CV splitters to +request metadata when calling methods such as `fit`, `predict`, etc. +Meta-estimators or functions that wrap estimators, scorers, or CV splitters will +use this API to pass in the requested metadata. + +Motivation and Scope +-------------------- + +Scikit-learn has limited support for passing around information that is not +`(X, y)`. For example, to pass `sample_weight` to a step of a `Pipeline`, one +needs to specify the step using dunder (`__`) prefixing:: + + >>> pipe = Pipeline([..., ('clf', LogisticRegression())]) + >>> pipe.fit(X, y, clf__sample_weight=sample_weight) + +Several other meta-estimators, such as `GridSearchCV`, support forwarding these +fit parameters to their base estimator when fitting. Yet a number of important +use cases are currently not supported: + +* Passing metadata (e.g. `sample_weight`) to a scorer used in cross-validation +* Passing metadata (e.g. `groups`) to a CV splitter in nested cross-validation +* Passing metadata (e.g. `sample_weight`) to some scorers and not others in + multi-metric cross-validation. This is also required to handle fairness + related metrics which usually expect one or more sensitive attributes to be + passed to them along with the data. +* Passing metadata to non-`fit` methods. For example, passing group indices for + samples that are to be treated as a single sequence in prediction, or passing + sensitive attributes to `predict` or `transform` of a fairness related + estimator. + +We define the following terms in this proposal: + +* **consumer**: An object that receives and consumes metadata, such as + estimators, scorers, or CV splitters. + +* **router**: An object that passes metadata to a **consumer** or + another **router**. Examples of **routers** include meta-estimators or + functions. 
(For example `GridSearchCV` or `cross_validate` route sample
+  weights, cross validation groups, etc. to **consumers**.)
+
+This SLEP proposes to add:
+
+* `get_metadata_routing` to all **consumers** and **routers**
+  (i.e. all estimators, scorers, and splitters supporting this API)
+* `set_*_request` to consumers (including estimators, scorers, and CV
+  splitters), where `*` is a method that requires metadata (e.g.
+  `set_fit_request`, `set_score_request`, `set_transform_request`, etc.)
+
+For example, `set_fit_request` configures an estimator to request metadata::
+
+    >>> log_reg = LogisticRegression().set_fit_request(sample_weight=True)
+
+`get_metadata_routing` is used by **routers** to inspect the metadata needed
+by **consumers**. `get_metadata_routing` returns a `MetadataRouter` or a
+`MetadataRequest` object that stores and handles metadata routing.
+`get_metadata_routing` returns enough information for a router to know what
+metadata is requested, and whether the metadata is sample aligned or not. See
+the draft implementation for more implementation details.
+
+Note that in the core library nothing is requested by default, except
+``groups`` in ``Group*CV`` objects, which request the ``groups`` metadata. At
+the time of writing this proposal, all metadata requested in the core library
+is sample aligned.
+
+Also note that ``X``, ``y``, and ``Y`` input arguments are never automatically
+added to the routing mechanism and are always passed into their respective
+methods.
+
+Detailed description
+--------------------
+
+This SLEP unlocks many machine learning use cases that were not possible
+before. In this section, we will focus on some workflows that are made possible
+by this SLEP.
+
+Nested Grouped Cross Validation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The following example demonstrates nested grouped cross validation,
+where a scorer and an estimator request `sample_weight` and `GroupKFold`
+requests `groups` by default::
+
+    >>> weighted_acc = make_scorer(accuracy_score).set_score_request(sample_weight=True)
+    >>> log_reg = (LogisticRegressionCV(cv=GroupKFold(), scoring=weighted_acc)
+    ...            .set_fit_request(sample_weight=True))
+    >>> cv_results = cross_validate(
+    ...     log_reg, X, y,
+    ...     cv=GroupKFold(),
+    ...     metadata={"sample_weight": my_weights, "groups": my_groups},
+    ...     scoring=weighted_acc)
+
+To support unweighted fitting and weighted scoring, metadata is set to `False`
+in `set_fit_request`::
+
+    >>> log_reg = (LogisticRegressionCV(cv=GroupKFold(), scoring=weighted_acc)
+    ...            .set_fit_request(sample_weight=False))
+    >>> cross_validate(
+    ...     log_reg, X, y,
+    ...     cv=GroupKFold(),
+    ...     metadata={'sample_weight': weights, 'groups': groups},
+    ...     scoring=weighted_acc)
+
+Unweighted Feature selection
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+**Consumers** that do not accept weights during fitting, such as
+`SelectKBest`, will *not* be routed weights::
+
+    >>> log_reg = (LogisticRegressionCV(cv=GroupKFold(), scoring=weighted_acc)
+    ...            .set_fit_request(sample_weight=True))
+    >>> sel = SelectKBest(k=2)
+    >>> pipe = make_pipeline(sel, log_reg)
+    >>> pipe.fit(X, y, sample_weight=weights, groups=groups)
+
+Note that if a **consumer** or a **router** starts accepting and consuming a
+certain metadata, the developer API enables developers to raise a warning
+and avoid silent behavior changes in users' code. See the draft implementation
+for more details.
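+
+For illustration only (a sketch; the exact representation of the request
+objects is left to the implementation), a **router** could confirm from the
+snippet above that ``sel`` consumes no metadata while ``log_reg`` requests
+``sample_weight`` for ``fit``::
+
+    >>> sel.get_metadata_routing()      # requests no metadata
+    >>> log_reg.get_metadata_routing()  # requests `sample_weight` for `fit`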
+
+Different Scoring and Fitting Weights
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+We can pass different weights for scoring and fitting by using aliases. In
+this example, `scoring_weight` is passed to the scorer and `fitting_weight`
+is passed to `LogisticRegressionCV`::
+
+    >>> weighted_acc = (make_scorer(accuracy_score)
+    ...                 .set_score_request(sample_weight="scoring_weight"))
+    >>> log_reg = (LogisticRegressionCV(cv=GroupKFold(), scoring=weighted_acc)
+    ...            .set_fit_request(sample_weight="fitting_weight"))
+    >>> cv_results = cross_validate(
+    ...     log_reg, X, y,
+    ...     cv=GroupKFold(),
+    ...     metadata={"scoring_weight": my_weights,
+    ...               "fitting_weight": my_other_weights,
+    ...               "groups": my_groups},
+    ...     scoring=weighted_acc)
+
+Nested Grouped Cross Validation with SearchCV
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Since `GroupKFold` requests group metadata by default, `GroupKFold` instances
+can be passed to multiple **routers** to enable nested grouped cross
+validation. In this example, both `RandomizedSearchCV` and `cross_validate`
+set `cv=GroupKFold()`, which enables grouped CV in the outer loop
+(`cross_validate`) and the inner random search::
+
+    >>> log_reg = LogisticRegression()
+    >>> distributions = {"C": uniform(loc=0, scale=4),
+    ...                  "penalty": ['l2', 'l1']}
+    >>> random_search = RandomizedSearchCV(log_reg, distributions, cv=GroupKFold())
+    >>> cv_results = cross_validate(
+    ...     random_search, X, y,
+    ...     cv=GroupKFold(),
+    ...     metadata={"groups": my_groups})
+
+Implementation
+--------------
+
+This SLEP has a draft implementation at :pr:`22083` by :user:`adrinjalali`. The
+implementation provides developer utilities that are used by scikit-learn and
+available to third-party estimators for adopting this SLEP. Specifically, the
+draft implementation makes it easier to define `get_metadata_routing` and
+`set_*_request` for **consumers** and **routers**.
+
+Backward compatibility
+----------------------
+
+Scikit-learn's meta-estimators will deprecate the dunder (`__`) syntax for
+routing and enforce explicit request method calls. During the deprecation
+period, using dunder syntax routing and explicit request calls together will
+raise an error.
+
+During the deprecation period, meta-estimators such as `GridSearchCV` will
+route `fit_params` to the inner estimators' `fit` by default, but
+a deprecation warning is raised::
+
+    >>> # Deprecation warning, stating that the provided metadata is not requested
+    >>> GridSearchCV(LogisticRegression(), ...).fit(X, y, sample_weight=sw)
+
+To avoid the warning, one would need to specify the request in
+`LogisticRegression`::
+
+    >>> grid = GridSearchCV(
+    ...     LogisticRegression().set_fit_request(sample_weight=True), ...
+    ... )
+    >>> grid.fit(X, y, sample_weight=sw)
+
+Meta-estimators such as `GridSearchCV` will check which metadata is requested,
+and will error when metadata is passed in and the inner estimator is
+not configured to request it::
+
+    >>> weighted_acc = make_scorer(accuracy_score).set_score_request(sample_weight=True)
+    >>> log_reg = LogisticRegression()
+    >>> grid = GridSearchCV(log_reg, ..., scoring=weighted_acc)
+    >>>
+    >>> # Raise a TypeError that log_reg is not specified with any routing
+    >>> # metadata for `sample_weight`, but sample_weight has been passed in to
+    >>> # `grid.fit`.
+    >>> grid.fit(X, y, sample_weight=sw)
+
+To avoid the error, `LogisticRegression` must specify its metadata request by
+calling `set_fit_request`::
+
+    >>> # Request sample weights
+    >>> log_reg_weights = LogisticRegression().set_fit_request(sample_weight=True)
+    >>> grid = GridSearchCV(log_reg_weights, ...)
+    >>> grid.fit(X, y, sample_weight=sw)
+    >>>
+    >>> # Do not request sample_weights
+    >>> log_reg_no_weights = LogisticRegression().set_fit_request(sample_weight=False)
+    >>> grid = GridSearchCV(log_reg_no_weights, ...)
+    >>> grid.fit(X, y, sample_weight=sw)
+
+Note that a meta-estimator will raise an error if the user passes metadata
+which is not requested by any of the child objects of the meta-estimator.
+
+Third-party estimators will need to adopt this SLEP in order to support
+metadata routing while the dunder syntax is deprecated. Our implementation
+will provide developer APIs to trigger warnings and errors as described above
+to help with adopting this SLEP.
+
+Alternatives
+------------
+
+Over the years, there have been many proposed alternatives before we landed
+on this SLEP:
+
+* :pr:`4696` A first implementation by :user:`amueller`
+* `Discussion towards SLEP004
+  <https://github.com/scikit-learn/enhancement_proposals/pull/6>`__ initiated
+  by :user:`tguillemot`.
+* :pr:`9566` Another implementation (solution 3 from this SLEP)
+  by :user:`jnothman`
+* This SLEP has emerged from many alternatives detailed at
+  :ref:`slep_006_other`.
+
+Discussion & Related work
+-------------------------
+
+This SLEP was drafted based on the discussions of potential solutions
+at the February 2019 development sprint in Paris. The overarching issue is
+found at "Consistent API for attaching properties to samples" at :issue:`4497`.
+
+Related issues and discussions include: :issue:`1574`, :issue:`2630`,
+:issue:`3524`, :issue:`4632`, :issue:`4652`, :issue:`4660`, :issue:`4696`,
+:issue:`6322`, :issue:`7112`, :issue:`7646`, :issue:`7723`, :issue:`8127`,
+:issue:`8158`, :issue:`8710`, :issue:`8950`, :issue:`11429`, :issue:`12052`,
+:issue:`15282`, :issue:`15370`, :issue:`15425`, :issue:`18028`.
+
+One benefit of the explicitness in this proposal is that even if it makes use
+of `**kwargs` arguments, it does not preclude keyword arguments serving other
+purposes. In addition to requesting sample metadata, a future proposal could
+allow estimators to request feature metadata or other keys.
+
+References and Footnotes
+------------------------
+
+.. [1] Each SLEP must either be explicitly labeled as placed in the public
+   domain (see this SLEP as an example) or licensed under the `Open
+   Publication License`_.
+.. _Open Publication License: https://www.opencontent.org/openpub/
+
+
+Copyright
+---------
+
+This document has been placed in the public domain. [1]_
diff --git a/slep007/proposal.rst b/slep007/proposal.rst
new file mode 100644
index 0000000..7f9185d
--- /dev/null
+++ b/slep007/proposal.rst
@@ -0,0 +1,289 @@
+.. _slep_007:
+
+====================================================
+SLEP007: Feature names, their generation and the API
+====================================================
+
+:Author: Adrin Jalali
+:Status: Final
+:Type: Standards Track
+:Created: 2019-04
+:Vote opened: 2021-10-26
+:Vote closed: 2021-11-29
+
+Implemented with `v1.1.0 <https://scikit-learn.org/stable/auto_examples/release_highlights/plot_release_highlights_1_1_0.html#get-feature-names-out-available-in-all-transformers>`__.
+
+Abstract
+########
+
+This SLEP proposes the introduction of the ``feature_names_in_`` attribute for
+all estimators, and the ``get_feature_names_out`` method for all transformers.
+We here discuss the generation of such attributes and their propagation through
+pipelines. Since for most estimators there are multiple ways to generate
+feature names, this SLEP does not intend to define how exactly feature names
+are generated for all of them.
+
+Motivation
+##########
+
+``scikit-learn`` has been making it easier to build complex workflows with the
+``ColumnTransformer`` and it has been seeing widespread adoption. However,
+using it results in pipelines where it's not clear what the input features to
+the final predictor are, even more so than before. For example, after fitting
+the following pipeline, users should ideally be able to inspect the features
+going into the final predictor::
+
+
+    X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)
+
+    # We will train our classifier with the following features:
+    # Numeric Features:
+    # - age: float.
+    # - fare: float.
+    # Categorical Features:
+    # - embarked: categories encoded as strings {'C', 'S', 'Q'}.
+    # - sex: categories encoded as strings {'female', 'male'}.
+    # - pclass: ordinal integers {1, 2, 3}.
+
+    # We create the preprocessing pipelines for both numeric and categorical data.
+    numeric_features = ['age', 'fare']
+    numeric_transformer = Pipeline(steps=[
+        ('imputer', SimpleImputer(strategy='median')),
+        ('scaler', StandardScaler())])
+
+    categorical_features = ['embarked', 'sex', 'pclass']
+    categorical_transformer = Pipeline(steps=[
+        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
+        ('onehot', OneHotEncoder(handle_unknown='ignore'))])
+
+    preprocessor = ColumnTransformer(
+        transformers=[
+            ('num', numeric_transformer, numeric_features),
+            ('cat', categorical_transformer, categorical_features)])
+
+    # Append classifier to preprocessing pipeline.
+    # Now we have a full prediction pipeline.
+    clf = Pipeline(steps=[('preprocessor', preprocessor),
+                          ('classifier', LogisticRegression())])
+
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
+
+    clf.fit(X_train, y_train)
+
+
+However, it's impossible to interpret or even sanity-check the
+``LogisticRegression`` instance that's produced in the example, because the
+correspondence of the coefficients to the input features is basically
+impossible to figure out.
+
+This proposal suggests adding a ``feature_names_in_`` attribute and a
+``get_feature_names_out`` method to fitted estimators, such that in the
+above-mentioned example ``clf[-1].feature_names_in_`` and
+``clf[-2].get_feature_names_out()`` will be::
+
+    ['num__age',
+     'num__fare',
+     'cat__embarked_C',
+     'cat__embarked_Q',
+     'cat__embarked_S',
+     'cat__embarked_missing',
+     'cat__sex_female',
+     'cat__sex_male',
+     'cat__pclass_1',
+     'cat__pclass_2',
+     'cat__pclass_3']
+
+Ideally the generated feature names describe how a feature is generated at each
+stage of a pipeline. For instance, ``cat__sex_female`` shows that the feature
+has been through a categorical preprocessing pipeline, was originally the
+column ``sex``, and has been one hot encoded and is one if it was originally
+``female``. However, this is not always possible or desirable, especially when
+a generated column is based on many columns, since the generated feature names
+will be too long, for example in ``PCA``. 
As a rule of thumb, the following
+types of transformers may generate feature names which correspond to the
+original features:
+
+- Leave columns unchanged, *e.g.* ``StandardScaler``
+- Select a subset of columns, *e.g.* ``SelectKBest``
+- Create new columns where each column depends on at most one input column,
+  *e.g.* ``OneHotEncoder``
+- Algorithms that create combinations of a fixed number of features, *e.g.*
+  ``PolynomialFeatures``, as opposed to combinations of all features, of which
+  there may be many. Note that verbosity considerations and
+  ``verbose_feature_names_out`` as explained later can apply here.
+
+This proposal talks about how feature names are generated and not how they are
+propagated.
+
+Scope
+#####
+
+The API for input and output feature names includes a ``feature_names_in_``
+attribute for all estimators, and a ``get_feature_names_out`` method for any
+estimator with a ``transform`` method, *i.e.* they expose the generated feature
+names via the ``get_feature_names_out`` method.
+
+Note that this SLEP also applies to `resamplers
+<https://github.com/scikit-learn/enhancement_proposals/pull/15>`_ the same way
+as transformers.
+
+Input Feature Names
+###################
+
+The input feature names are stored in a fitted estimator in a
+``feature_names_in_`` attribute, and are taken from the given input data, for
+instance a ``pandas`` data frame. This attribute will be ``None`` if the input
+provides no feature names. The ``feature_names_in_`` attribute is a 1d NumPy
+array with object dtype and all elements in the array are strings.
+
+Output Feature Names
+####################
+
+A fitted estimator exposes the output feature names through the
+``get_feature_names_out`` method. The output of ``get_feature_names_out`` is a
+1d NumPy array with object dtype and all elements in the array are strings.
+Here we discuss in more detail how these feature names are generated. Since
+for most estimators there are multiple ways to generate feature names, this
+SLEP does not intend to define how exactly feature names are generated for all
+of them. It is instead a guideline on how they could generally be generated.
+
+As detailed below, some generated output feature names are the same as, or
+derived from, the input feature names. In such cases, if no input feature names
+are provided, ``x0`` to ``xn`` are assumed to be their names.
+
+Feature Selector Transformers
+*****************************
+
+This includes transformers which output a subset of the input features,
+without changing them. For example, if a ``SelectKBest`` transformer selects
+the first and the third features, and no names are provided,
+``get_feature_names_out`` will return ``[x0, x2]``.
+
+Feature Generating Transformers
+*******************************
+
+The simplest category of transformers in this section are the ones which
+generate a column based on a single given column. These would simply
+preserve the input feature names if a single new feature is generated,
+such as in ``StandardScaler``, which would map ``'age'`` to ``'age'``.
+If an input feature maps to multiple new
+features, a postfix is added, so that ``OneHotEncoder`` might map
+``'gender'`` to ``'gender_female'``, ``'gender_fluid'``, etc.
+
+Transformers where each output feature depends on a fixed number of input
+features may generate descriptive names as well. For instance, a
+``PolynomialFeatures`` transformer on a small subset of features can generate
+an output feature name such as ``x[0] * x[2] ** 3``.
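+
+As an illustration (not part of the original proposal text), the released
+implementation mentioned above can demonstrate this category; note that the
+exact generated strings may differ from the sketches in this section::
+
+    >>> import pandas as pd
+    >>> from sklearn.preprocessing import PolynomialFeatures
+    >>> df = pd.DataFrame({'age': [1.0, 2.0], 'fare': [3.0, 4.0]})
+    >>> poly = PolynomialFeatures(degree=2, include_bias=False).fit(df)
+    >>> poly.feature_names_in_
+    array(['age', 'fare'], dtype=object)
+    >>> poly.get_feature_names_out()
+    array(['age', 'fare', 'age^2', 'age fare', 'fare^2'], dtype=object)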
And finally, transformers where each output feature depends on many or all
input features generate feature names of the form ``name0`` to
``namen``, where ``name`` represents the transformer. For instance, a ``PCA``
transformer will output ``[pca0, ..., pcan]``, ``n`` being the number of PCA
components.

Meta-Estimators
***************

Meta-estimators can choose whether to prefix the output feature names given by
the estimators they wrap.

By default, ``Pipeline`` adds no prefix, *i.e.* its ``get_feature_names_out()``
is the same as the ``get_feature_names_out()`` of the last step, and ``None``
if the last step is not a transformer.

``ColumnTransformer`` by default adds a prefix to the output feature names,
indicating the name of the transformer applied to them. If a column is in the
output as a part of ``passthrough``, it won't be prefixed since no operation
has been applied on it.

Examples
########

Here we include some examples to demonstrate the behavior of output feature
names::

    100 features (no names) -> PCA(n_components=3)
    get_feature_names_out(): [pca0, pca1, pca2]


    100 features (no names) -> SelectKBest(k=3)
    get_feature_names_out(): [x2, x17, x42]


    [f1, ..., f100] -> SelectKBest(k=3)
    get_feature_names_out(): [f2, f17, f42]


    [cat0] -> OneHotEncoder()
    get_feature_names_out(): [cat0_cat, cat0_dog, ...]


    [f1, ..., f100] -> Pipeline(
                           [SelectKBest(k=30),
                            PCA(n_components=3)]
                       )
    get_feature_names_out(): [pca0, pca1, pca2]


    [model, make, numeric0, ..., numeric100] ->
        ColumnTransformer(
            [('cat', Pipeline(SimpleImputer(), OneHotEncoder()),
              ['model', 'make']),
             ('num', Pipeline(SimpleImputer(), PCA(n_components=3)),
              ['numeric0', ..., 'numeric100'])]
        )
    get_feature_names_out(): ['cat__model_100', 'cat__model_200', ...,
                              'cat__make_ABC', 'cat__make_XYZ', ...,
                              'num__pca0', 'num__pca1', 'num__pca2']

However, the following example produces somewhat redundant feature names::

    [model, make, numeric0, ..., numeric100] ->
        ColumnTransformer([
            ('ohe', OneHotEncoder(), ['model', 'make']),
            ('pca', PCA(n_components=3), ['numeric0', ..., 'numeric100'])
        ])
    get_feature_names_out(): ['ohe__model_100', 'ohe__model_200', ...,
                              'ohe__make_ABC', 'ohe__make_XYZ', ...,
                              'pca__pca0', 'pca__pca1', 'pca__pca2']

Extensions
##########

verbose_feature_names_out
*************************

To provide more control over feature names, we could add a boolean
``verbose_feature_names_out`` constructor argument to certain transformers.
The default would reflect the description above, but changing it would allow
more verbose names in some transformers, say having ``StandardScaler`` map
``'age'`` to ``'scale(age)'``.

In the case of the ``ColumnTransformer`` example above,
``verbose_feature_names_out`` could remove the estimator names, leading to
shorter and less redundant names::

    [model, make, numeric0, ..., numeric100] ->
        make_column_transformer(
            (OneHotEncoder(), ['model', 'make']),
            (PCA(n_components=3), ['numeric0', ..., 'numeric100']),
            verbose_feature_names_out=False
        )
    get_feature_names_out(): ['model_100', 'model_200', ...,
                              'make_ABC', 'make_XYZ', ...,
                              'pca0', 'pca1', 'pca2']

Alternative solutions to a boolean flag could include:

- an integer: fine-tuning the verbosity of the generated feature names.
- a ``callable``: giving the user further flexibility to generate
  user-defined feature names (see the sketch below).
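For instance, the callable variant might look as follows (purely illustrative;
neither the callable support nor its signature is part of this SLEP)::

    from sklearn.compose import make_column_transformer
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import OneHotEncoder

    def drop_prefix(transformer_name, feature_name):
        # A user-defined naming policy: ignore the transformer name
        # and keep only the feature's own name.
        return feature_name

    ct = make_column_transformer(
        (OneHotEncoder(), ['model', 'make']),
        (PCA(n_components=3), ['numeric0', 'numeric1']),
        verbose_feature_names_out=drop_prefix,  # hypothetical callable support
    )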
These alternatives may be discussed and implemented in the future if deemed
necessary.

Backward Compatibility
######################

All estimators should implement the ``feature_names_in_`` and
``get_feature_names_out()`` API. This is checked in ``check_estimator``, and
the transition is done with a ``FutureWarning`` for at least two versions to
give third party developers time to implement the API.
diff --git a/slep009/proposal.rst b/slep009/proposal.rst
new file mode 100644
index 0000000..c6ca4e3
--- /dev/null
+++ b/slep009/proposal.rst
@@ -0,0 +1,222 @@
.. _slep_009:

===============================
SLEP009: Keyword-only arguments
===============================

:Author: Adrin Jalali
:Status: Final
:Type: Standards Track
:Created: 2019-07-13
:Vote opened: 2019-09-11

Implemented with `v0.23 <https://scikit-learn.org/stable/whats_new/v0.23.html#enforcing-keyword-only-arguments>`__
and `v1.0.0 <https://scikit-learn.org/stable/whats_new/v1.0.html#enforcing-keyword-only-arguments>`__.

Abstract
########

This proposal discusses the path to gradually forcing users to pass arguments,
or most of them, as keyword arguments only. It describes the status quo, the
motivation for introducing the change, and the pros and cons of the
change. The original issue starting the discussion is located
`here <https://github.com/scikit-learn/scikit-learn/issues/12805>`__.

Motivation
##########

At the moment `sklearn` accepts all arguments both as positional and as
keyword arguments. For example, both of the following are valid:

.. code-block:: python

    # positional arguments
    clf = svm.SVC(.1, 'rbf')
    # keyword arguments
    clf = svm.SVC(C=.1, kernel='rbf')


Using keyword arguments has a few benefits:

- It is more readable.
- For models which accept many parameters, especially numerical ones, it is
  less error-prone than positional arguments. Compare these examples:

.. code-block:: python

    cls = cluster.OPTICS(
        min_samples=5, max_eps=inf, metric='minkowski', p=2,
        metric_params=None, cluster_method='xi', eps=None, xi=0.05,
        predecessor_correction=True, min_cluster_size=None, algorithm='auto',
        leaf_size=30, n_jobs=None)

    cls = cluster.OPTICS(5, inf, 'minkowski', 2, None, 'xi', None, 0.05,
                         True, None, 'auto', 30, None)


- It allows adding new parameters next to other related parameters, rather
  than appending them at the end of the list, without breaking backward
  compatibility. Right now all new parameters are added at the end of the
  signature. Once we move to a keyword-only argument list, we can change their
  order and put related parameters together. Assuming at some point numpydoc
  supports sections for parameters, these groups of parameters would go in
  different sections, making the documentation more readable. Also, note that
  we have previously assumed users would pass most parameters by name and have
  sometimes considered changes to be backwards compatible when they modified
  the order of parameters. For example, user code relying on positional
  arguments could break after a deprecated parameter was removed. Accepting
  this SLEP would make this requirement explicit.

Solution
########

The officially supported way to have keyword-only arguments is:

.. code-block:: python

    def func(arg1, arg2, *, arg3, arg4):
        ...

This means the function can only be called with `arg3` and `arg4` specified
as keyword arguments:

.. code-block:: python

    func(1, 2, arg3=3, arg4=4)

The feature was discussed in the related PEP,
`PEP3102 <https://www.python.org/dev/peps/pep-3102/>`_, which was accepted in
2006; the syntax was introduced in Python 3.0.

For the change to happen in ``sklearn``, we would need to add the ``*`` where
we want all subsequent parameters to be passed as keyword only.

Considerations
##############

We can identify the following main challenges: familiarity of the users with
the syntax, and its support by different IDEs.

Syntax
------

Partly because the SciPy/PyData ecosystem supported Python 2 until recently,
the feature (among other Python 3 features) has seen limited adoption, and
users may not be used to seeing the syntax. The similarity between the
following two definitions may also be confusing to some users:

.. code-block:: python

    def f(arg1, *arg2, arg3): pass  # variable length arguments at arg2

    def f(arg1, *, arg3): pass  # no arguments accepted at *

However, some other teams are already moving towards using the syntax, such as
``matplotlib``, which introduced the syntax with a deprecation cycle using a
decorator for this purpose in version 3.1. The related PRs can be found `here
<https://github.com/matplotlib/matplotlib/pull/13601>`__ and `here
<https://github.com/matplotlib/matplotlib/pull/14130>`__. Soon users will be
familiar with the syntax.

IDE Support
-----------

Many users rely on autocomplete and parameter hints of the IDE while coding.
Here is how the hint looks in two different environments. For instance, for the
above function, defined in VSCode, the hint would be shown as:

.. code-block:: python

    func(arg1, arg2, *, arg3, arg4)

    param arg3
    func(1, 2, |)

The good news is that the IDE understands the syntax and tells the user it's
``arg3``'s turn. But it doesn't say it is a keyword-only argument.

`ipython`, however, suggests all parameters be given with the keyword anyway:

.. code-block:: python

    In [1]: def func(arg1, arg2, *, arg3, arg4): pass

    In [2]: func(
                 abs()            arg3=
                 all()            arg4=
                 any()            ArithmeticError        >
                 arg1=            ascii()
                 arg2=            AssertionError

Scope
#####

An important open question is which functions/methods and/or parameters should
follow this pattern, and which parameters should be keyword only. We can
identify the following categories of functions/methods:

- ``__init__``
- Main methods of the API, *i.e.* ``fit``, ``transform``, etc.
- All other methods, *e.g.* ``SpectralBiclustering.get_submatrix``
- Functions

With regard to the common methods of the API, the decision for these methods
should be the same throughout the library in order to keep a consistent
interface to the user.

This proposal suggests making only the *most commonly* used parameters
positional. The *most commonly* used parameters are defined per method or
function, in either of the following two ways:

- The set defined and agreed upon by the core developers, which should cover
  the *easy* cases.
- A set identified as being in the top 95% of the use cases, using some
  automated analysis such as `this one
  <https://odyssey.readthedocs.io/en/latest/tutorial.html>`__ or `this one
  <https://github.com/Quansight-Labs/python-api-inspect>`__.

This way we would minimize the number of warnings the users would receive,
which minimizes the friction caused by the change.
This SLEP does not define
these parameter sets, and the respective decisions shall be made in their
corresponding pull requests.

Deprecation Path
----------------

For a smooth transition, we need an easy deprecation path. Similar to the
decorators developed in ``matplotlib``, a proposed solution is available at
`#13311 <https://github.com/scikit-learn/scikit-learn/pull/13311>`__, which
deprecates the usage of positional arguments on selected functions and methods.
With the decorator, the user sees a warning if they pass the designated
keyword-only arguments as positional, and removing the decorator would result
in an error. Examples (borrowing from the PR):

.. code-block:: python

    @warn_args
    def dbscan(X, eps=0.5, *, min_samples=4, metric='minkowski'):
        pass


    class LogisticRegression:

        @warn_args
        def __init__(self, penalty='l2', *, dual=False):

            self.penalty = penalty
            self.dual = dual


Calling ``LogisticRegression('l2', True)`` will result in a
``DeprecationWarning``:

.. code-block:: bash

    Should use keyword args: dual=True


Once the deprecation period is over, we'd remove the decorator, and calling
the function/method with positional arguments after `*` would fail.

The final decorator solution shall make sure it is well understood by the most
commonly used IDEs and editors, such as IPython, Jupyter Lab, Emacs, vim,
VSCode, and PyCharm.
diff --git a/slep010/proposal.rst b/slep010/proposal.rst
new file mode 100644
index 0000000..8970b68
--- /dev/null
+++ b/slep010/proposal.rst
@@ -0,0 +1,111 @@
.. _slep_010:

=====================================
SLEP010: ``n_features_in_`` attribute
=====================================

:Author: Nicolas Hug
:Status: Final
:Type: Standards Track
:Created: 2019-11-23

Implemented with `v0.23 <https://scikit-learn.org/stable/whats_new/v0.23.html?highlight=n_features_in_#id13>`__.

Abstract
########

This SLEP proposes the introduction of a public ``n_features_in_`` attribute
for most estimators (where relevant).

Motivation
##########

Knowing the number of features that an estimator expects is useful for
inspection purposes. It is also useful for implementing feature name
propagation (`SLEP 8
<https://github.com/scikit-learn/enhancement_proposals/pull/18>`_). For
example, any of the scalers can easily create feature names if it knows
``n_features_in_``.

Solution
########

The proposed solution is to replace most calls to ``check_array()`` or
``check_X_y()`` by calls to a newly created private method::

    def _validate_data(self, X, y=None, reset=True, **check_array_params):
        ...

The ``_validate_data()`` method will call the ``check_array()`` or
``check_X_y()`` function, depending on whether ``y`` is passed.

If the ``reset`` parameter is True (the default), the method will set the
``n_features_in_`` attribute of the estimator, regardless of its potential
previous value. This should typically be used in ``fit()``, or in the first
``partial_fit()`` call. Passing ``reset=False`` will not set the attribute but
instead check against it, and potentially raise an error. This should typically
be used in ``predict()`` or ``transform()``, or on subsequent calls to
``partial_fit``.

In most cases, the ``n_features_in_`` attribute exists only once ``fit`` has
been called, but there are exceptions (see below).
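A minimal sketch of the intended semantics of the method (illustrative only;
the actual implementation performs more validation than shown here)::

    from sklearn.utils import check_array, check_X_y

    def _validate_data(self, X, y=None, reset=True, **check_array_params):
        # Delegate to the stateless helpers.
        if y is None:
            X = check_array(X, **check_array_params)
        else:
            X, y = check_X_y(X, y, **check_array_params)

        n_features = X.shape[1]
        if reset:
            # fit() / first partial_fit(): record the number of features.
            self.n_features_in_ = n_features
        elif (hasattr(self, "n_features_in_")
              and n_features != self.n_features_in_):
            # predict(), transform(), later partial_fit(): check consistency.
            raise ValueError(
                f"X has {n_features} features, but this estimator was "
                f"fitted with {self.n_features_in_} features.")
        return X if y is None else (X, y)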
A new common check is added: it makes sure that for most estimators, the
``n_features_in_`` attribute does not exist until ``fit`` is called, and
that its value is correct. Instead of raising an exception, this check will
raise a warning for the next two releases. This will give downstream
packages some time to adjust (see considerations below).

Since the introduced method is private, third party libraries are
recommended not to rely on it.

The logic that is proposed here (calling a stateful method instead of a
stateless function) is a prerequisite to fixing the dataframe column
ordering issue: with a stateless ``check_array``, there is no way to raise
an error if the column ordering of a dataframe was changed between ``fit``
and ``predict``. This is however out of scope for this SLEP, which only focuses
on the introduction of the ``n_features_in_`` attribute.

Considerations
##############

The main consideration is that the addition of the common test means that
existing estimators in downstream libraries will not pass our test suite,
unless the estimators also have the ``n_features_in_`` attribute.

The newly introduced checks will only raise a warning instead of an exception
for the next 2 releases, so this will give more time for downstream packages
to adjust.

There are other minor considerations:

- In most meta-estimators, the input validation is handled by the
  sub-estimator(s). The ``n_features_in_`` attribute of the meta-estimator
  is thus explicitly set to that of the sub-estimator, either via a
  ``@property``, or directly in ``fit()``.
- Some estimators, like the dummy estimators, do not validate the input
  (the 'no_validation' tag should be True). The ``n_features_in_`` attribute
  should be set to None, though this is not enforced in the common check.
- Some estimators expect a non-rectangular input: the vectorizers. These
  estimators expect dicts or lists, not a ``n_samples * n_features`` matrix.
  ``n_features_in_`` makes no sense here and these estimators just don't have
  the attribute.
- Some estimators may know the number of input features before ``fit`` is
  called: typically the ``SparseCoder``, where ``n_features_in_`` is known at
  ``__init__`` from the ``dictionary`` parameter. In this case the attribute
  is a property and is available right after object instantiation.

References and Footnotes
------------------------

.. [1] Each SLEP must either be explicitly labeled as placed in the public
   domain (see this SLEP as an example) or licensed under the `Open
   Publication License`_.

.. _Open Publication License: https://www.opencontent.org/openpub/


Copyright
---------

This document has been placed in the public domain. [1]_
diff --git a/slep012/proposal.rst b/slep012/proposal.rst
new file mode 100644
index 0000000..aad4a46
--- /dev/null
+++ b/slep012/proposal.rst
@@ -0,0 +1,135 @@
.. _slep_012:

=======================
SLEP012: ``InputArray``
=======================

:Author: Adrin Jalali
:Status: Withdrawn (superseded by :ref:`SLEP007 <slep_007>` and :ref:`SLEP018 <slep_018>`)
:Type: Standards Track
:Created: 2019-12-20

Motivation
##########

This proposal presents a solution for propagating feature names through
transformers, pipelines, and the column transformer. Ideally, we would have::

    df = pd.read_csv('tabular.csv')
    # transforming the data in an arbitrary way
    transformer0 = ColumnTransformer(...)
    # a pipeline preprocessing the data and then a classifier (or a regressor)
    clf = make_pipeline(transformer0, ..., SVC())

    # now we can investigate features at each stage of the pipeline
    clf[-1].input_feature_names_

The feature names are propagated throughout the pipeline and the user can
investigate them at each step of the pipeline.

This proposal suggests adding a new data structure, called ``InputArray``,
which augments the data array ``X`` with additional meta-data. In this proposal
we assume the feature names (and other potential meta-data) are attached to the
data when passed to an estimator. Alternative solutions are discussed later in
this document.

A main constraint of this data structure is that it should be backward
compatible, *i.e.* code which expects a ``numpy.ndarray`` as the output of a
transformer would not break. This SLEP focuses on *feature names* as the only
meta-data attached to the data. Support for other meta-data can be added later.

Backward/NumPy/Pandas Compatibility
###################################

Since currently transformers return a ``numpy`` or a ``scipy`` array, backward
compatibility in this context means the operations which are valid on those
arrays should also be valid on the new data structure.

All operations are delegated to the *data* part of the container, and the
meta-data is lost immediately after each operation: operations result in a
plain ``numpy.ndarray``. This includes indexing and slicing, *i.e.* to avoid
performance degradation, ``__getitem__`` is not overloaded, and if the user
wishes to preserve the meta-data, they shall do so via explicitly calling a
method such as ``select()``. Operations between two ``InputArray`` objects will
not try to align rows and/or columns of the two given objects.

``pandas`` compatibility comes ideally as a ``pd.DataFrame(inputarray)``, for
which ``pandas`` does not provide a clean API at the moment. Alternatively,
``inputarray.todataframe()`` would return a ``pandas.DataFrame`` with the
relevant meta-data attached.

Feature Names
#############

Feature names are an object ``ndarray`` of strings aligned with the columns.
They can be ``None``.

Operations
##########

Estimators understand the ``InputArray`` and extract the feature names from the
given data before applying the operations and transformations on the data.

All transformers return an ``InputArray`` with feature names attached to it.
The way feature names are generated is discussed in *SLEP007 - The Style of The
Feature Names*.

Sparse Arrays
#############

Ideally sparse arrays follow the same pattern, but since ``scipy.sparse`` does
not provide the kind of API provided by ``numpy``, we may need to find
compromises.

Factory Methods
###############

There will be factory methods creating an ``InputArray`` given a
``pandas.DataFrame``, an ``xarray.DataArray``, or simply an ``np.ndarray`` or a
``scipy.sparse`` matrix together with a given set of feature names.

An ``InputArray`` can also be converted to a ``pandas.DataFrame`` using a
``todataframe()`` method.
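As an illustration only, such a container could be sketched as a thin
``ndarray`` subclass (all names here are hypothetical)::

    import numpy as np
    import pandas as pd

    class InputArray(np.ndarray):
        """Sketch: a numpy array carrying optional feature names."""

        def __new__(cls, data, feature_names=None):
            obj = np.asarray(data).view(cls)
            if feature_names is not None:
                feature_names = np.asarray(feature_names, dtype=object)
            obj.feature_names = feature_names
            return obj

        def __array_finalize__(self, obj):
            # Per the proposal, meta-data does not survive operations:
            # derived arrays behave like plain arrays without names.
            self.feature_names = None

        def todataframe(self):
            return pd.DataFrame(np.asarray(self), columns=self.feature_names)

    def make_inputarray(X, feature_names):
        return InputArray(X, feature_names)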
``X`` being an ``InputArray``::

    >>> np.array(X)
    >>> X.todataframe()
    >>> pd.DataFrame(X)  # only if pandas implements the API

And given ``X`` a ``np.ndarray`` or a ``scipy.sparse`` matrix and a set of
feature names, one can make the right ``InputArray`` using::

    >>> make_inputarray(X, feature_names)

Alternative Solutions
#####################

Since we expect the feature names to be attached to the data given to an
estimator, there are a few potential approaches we can take:

- ``pandas`` in, ``pandas`` out: this means we expect the user to give the data
  as a ``pandas.DataFrame``, and if so, the transformer would output a
  ``pandas.DataFrame`` which also includes the [generated] feature names. This
  is not a feasible solution since ``pandas`` plans to move to a per-column
  representation, which means ``pd.DataFrame(np.asarray(df))`` has two
  guaranteed memory copies.
- ``XArray``: we could accept a ``pandas.DataFrame``, and use
  ``xarray.DataArray`` as the output of transformers, including feature names.
  However, ``xarray`` has a hard dependency on ``pandas``, and uses
  ``pandas.Index`` to handle row labels and aligns rows when an operation
  between two ``xarray.DataArray`` objects is done, which can be time consuming
  and is not the semantics expected in ``scikit-learn``; we only expect the
  number of rows to be equal, and that the rows always correspond to one
  another in the same order.

As a result, we need to have another data structure which we'll use to transfer
data-related information (such as feature names), which is lightweight and
doesn't interfere with existing user code.

Another alternative to the problem of passing meta-data around is to pass it
as a parameter to ``fit``. This would involve heavily modifying meta-estimators,
since they'd need to pass that information on, and extract the relevant
information from the estimators to pass it along to the next estimator. Our
prototype implementations showed significant challenges compared to when the
meta-data is attached to the data.
diff --git a/slep013/proposal.rst b/slep013/proposal.rst
new file mode 100644
index 0000000..0fb0b2d
--- /dev/null
+++ b/slep013/proposal.rst
@@ -0,0 +1,120 @@
.. _slep_013:

======================================
SLEP013: ``n_features_out_`` attribute
======================================

:Author: Adrin Jalali
:Status: Rejected
:Type: Standards Track
:Created: 2020-02-12

Abstract
########

This SLEP proposes the introduction of a public ``n_features_out_`` attribute
for most transformers (where relevant).

Motivation
##########

Knowing the number of features that a transformer outputs is useful for
inspection purposes. This is in conjunction with `SLEP010: n_features_in_
<https://scikit-learn-enhancement-proposals.readthedocs.io/en/latest/slep010/proposal.html>`_.

Take the following piece as an example::

    X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)

    # We will train our classifier with the following features:
    # Numeric Features:
    # - age: float.
    # - fare: float.
    # Categorical Features:
    # - embarked: categories encoded as strings {'C', 'S', 'Q'}.
    # - sex: categories encoded as strings {'female', 'male'}.
    # - pclass: ordinal integers {1, 2, 3}.

    # We create the preprocessing pipelines for both numeric and categorical data.
    numeric_features = ['age', 'fare']
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])

    categorical_features = ['embarked', 'sex', 'pclass']
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])

    # Append classifier to preprocessing pipeline.
    # Now we have a full prediction pipeline.
    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', LogisticRegression())])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    clf.fit(X_train, y_train)

The user could then inspect the number of features going out from each step::

    # Total number of output features from the `ColumnTransformer`
    clf[0].n_features_out_

    # Number of features as a result of the numerical pipeline:
    clf[0].named_transformers_['num'].n_features_out_

    # Number of features as a result of the categorical pipeline:
    clf[0].named_transformers_['cat'].n_features_out_

Solution
########

The proposed solution is for the ``n_features_out_`` attribute to be set once a
call to ``fit`` is done. In many cases the value of ``n_features_out_`` is the
same as some other attribute stored in the transformer, *e.g.*
``n_components_``, and in these cases a ``Mixin`` such as a ``ComponentsMixin``
can delegate ``n_features_out_`` to those attributes.

Testing
-------

A test is added to the common tests to ensure the presence of the attribute or
property after calling ``fit``.

Considerations
##############

The main consideration is that the addition of the common test means that
existing estimators in downstream libraries will not pass our test suite,
unless the estimators also have the ``n_features_out_`` attribute.

The newly introduced checks will only raise a warning instead of an exception
for the next 2 releases, so this will give more time for downstream packages
to adjust.

There are other minor considerations:

- In some meta-estimators, this is delegated to the
  sub-estimator(s). The ``n_features_out_`` attribute of the meta-estimator is
  thus explicitly set to that of the sub-estimator, either via a ``@property``,
  or directly in ``fit()``.
- Some transformers such as ``FunctionTransformer`` may not know the number
  of output features since arbitrary arrays can be passed to `transform`. In
  such cases ``n_features_out_`` is set to ``None``.

Copyright
---------

This document has been placed in the public domain. [1]_

References and Footnotes
------------------------

.. [1] Each SLEP must either be explicitly labeled as placed in the public
   domain (see this SLEP as an example) or licensed under the `Open
   Publication License`_.

.. _Open Publication License: https://www.opencontent.org/openpub/
diff --git a/slep014/proposal.rst b/slep014/proposal.rst
new file mode 100644
index 0000000..adf8fbc
--- /dev/null
+++ b/slep014/proposal.rst
@@ -0,0 +1,264 @@
.. _slep_014:

==============================
SLEP014: Pandas In, Pandas Out
==============================

:Author: Thomas J Fan
:Status: Rejected
:Type: Standards Track
:Created: 2020-02-18

Abstract
########

This SLEP proposes using pandas DataFrames for propagating feature names
through ``scikit-learn`` transformers.

Motivation
##########

``scikit-learn`` is commonly used as a part of a larger data processing
pipeline.
When this pipeline is used to transform data, the result is a
NumPy array, discarding column names. The current workflow for
extracting the feature names requires calling ``get_feature_names`` on the
transformer that created the feature. This interface can be cumbersome when
used together with a pipeline with multiple column names::

    import pandas as pd
    import numpy as np
    from sklearn.compose import make_column_transformer
    from sklearn.preprocessing import OneHotEncoder, StandardScaler
    from sklearn.pipeline import make_pipeline
    from sklearn.linear_model import LogisticRegression

    X = pd.DataFrame({'letter': ['a', 'b', 'c'],
                      'pet': ['dog', 'snake', 'dog'],
                      'num': [1, 2, 3]})
    y = [0, 0, 1]
    orig_cat_cols, orig_num_cols = ['letter', 'pet'], ['num']

    ct = make_column_transformer(
        (OneHotEncoder(), orig_cat_cols), (StandardScaler(), orig_num_cols))
    pipe = make_pipeline(ct, LogisticRegression()).fit(X, y)

    cat_names = (pipe['columntransformer']
                 .named_transformers_['onehotencoder']
                 .get_feature_names(orig_cat_cols))

    feature_names = np.r_[cat_names, orig_num_cols]

The ``feature_names`` extracted above correspond to the features directly
passed into ``LogisticRegression``. As demonstrated above, the process of
extracting ``feature_names`` requires knowing the order of the selected
categories in the ``ColumnTransformer``. Furthermore, if there is feature
selection in the pipeline, such as ``SelectKBest``, the ``get_support`` method
would need to be used to determine the column names that were selected.

Solution
########

The pandas DataFrame has been widely adopted by the Python data ecosystem to
store data with feature names. This SLEP proposes using a DataFrame to
track the feature names as the data is transformed. With this feature, the
API for extracting feature names would be::

    from sklearn import set_config
    set_config(pandas_in_out=True)

    pipe.fit(X, y)
    X_trans = pipe[:-1].transform(X)

    X_trans.columns.tolist()
    ['letter_a', 'letter_b', 'letter_c', 'pet_dog', 'pet_snake', 'num']

This SLEP proposes attaching feature names to the output of ``transform``. In
the above example, ``pipe[:-1].transform(X)`` propagates the feature names
through the multiple transformers.

This feature is only available through a soft dependency on pandas.
Furthermore, it will be opt-in via the configuration flag ``pandas_in_out``. By
default, ``pandas_in_out`` is set to ``False``, resulting in the output of all
estimators being an ndarray.

Enabling Functionality
######################

The following enhancements are **not** a part of this SLEP. These features are
made possible if this SLEP gets accepted.

1. Allows estimators to treat columns differently based on name or dtype. For
   example, the categorical dtype is useful for tree building algorithms.

2. Storing feature names inside estimators for model inspection::

    from sklearn import set_config
    set_config(store_feature_names_in=True)

    pipe.fit(X, y)

    pipe['logisticregression'].feature_names_in_

3. Allow for extracting the feature names of estimators in meta-estimators::

    from sklearn import set_config
    set_config(store_feature_names_in=True)

    est = BaggingClassifier(LogisticRegression())
    est.fit(X, y)

    # Gets the feature names used by an estimator in the ensemble
    est.estimators_[0].feature_names_in_

For options 2 and 3, the default value of the configuration flag
``store_feature_names_in`` is ``False``.
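For illustration, the opt-in wrapping could conceptually happen at the end of
``transform`` (a sketch; ``pandas_in_out`` is the flag proposed above, and
``maybe_wrap_output`` is a hypothetical helper, not an existing function)::

    import pandas as pd
    from sklearn import get_config

    def maybe_wrap_output(X_trans, feature_names):
        # Return a DataFrame only when the proposed flag is enabled.
        if get_config().get("pandas_in_out", False):
            return pd.DataFrame(X_trans, columns=feature_names)
        return X_trans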
Considerations
##############

Memory copies
-------------

As noted in `pandas #27211 <https://github.com/pandas-dev/pandas/issues/27211>`_,
there is no guarantee of a zero-copy round-trip going from numpy
to a DataFrame. In other words, the following may lead to a memory copy in
a future version of ``pandas``::

    X = np.array(...)
    X_df = pd.DataFrame(X)
    X_again = np.asarray(X_df)

This is an issue for ``scikit-learn`` when estimators are placed into a
pipeline. For example, consider the following pipeline::

    set_config(pandas_in_out=True)
    pipe = make_pipeline(StandardScaler(), LogisticRegression())
    pipe.fit(X, y)

Internally, ``StandardScaler.fit_transform`` will operate on an ndarray and
wrap the ndarray into a DataFrame as a return value. This will then be
piped into ``LogisticRegression.fit``, which calls ``check_array`` on the
DataFrame and may thus incur a memory copy in a future version of
``pandas``. This leads to unnecessary overhead from piping the data from one
estimator to another.

Sparse matrices
---------------

Traditionally, ``scikit-learn`` prefers to process sparse matrices in
the compressed sparse row (CSR) matrix format. The `sparse data structure
<https://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html>`_ in
pandas 1.0 only supports converting directly to the coordinate format (COO).
Although this format was designed to quickly convert to CSR or CSC formats, the
conversion process still needs to allocate additional memory. This can be an
issue with transformers such as ``OneHotEncoder.transform``, which has been
optimized to construct a CSR matrix.

Backward compatibility
######################

The ``set_config(pandas_in_out=True)`` global configuration flag will be set to
``False`` by default to ensure backward compatibility. When this flag is False,
the output of all estimators will be an ndarray.

Community Adoption
##################

With the new ``pandas_in_out`` configuration flag, third party libraries may
need to query the configuration flag to be fully compliant with this SLEP.
Specifically, "to be fully compliant" entails the following policy:

1. If ``pandas_in_out=False``, then ``transform`` always returns a numpy array.
2. If ``pandas_in_out=True``, then ``transform`` returns a DataFrame if the
   input is a DataFrame.

This policy can either be enforced with ``check_estimator`` or not:

- **Enforce**: This increases the maintenance burden of third party libraries.
  This burden includes checking for the configuration flag, generating feature
  names, and including pandas as a dependency of the library.

- **Not Enforce**: Currently, third party transformers can return a DataFrame
  or a numpy array, and this is mostly compatible with ``scikit-learn``. Users
  with third party transformers would not be able to access the features
  enabled by this SLEP.


Alternatives
############

This section lists alternative data structures that can be used, with their
advantages and disadvantages when compared to a pandas DataFrame.

InputArray
----------

The proposed ``InputArray`` described in
:ref:`SLEP012 Custom InputArray Data Structure <slep_012>` introduces a new
data structure for homogeneous data.

Pros
~~~~

- A thin wrapper around a numpy array or a sparse matrix with a minimal feature
  set that ``scikit-learn`` can evolve independently.

Cons
~~~~

- Introduces another data structure for data storage in the PyData ecosystem.
- Currently, the design only allows for homogeneous data.
- Increases maintenance responsibilities for ``scikit-learn``.

XArray Dataset
--------------

`xarray's Dataset <http://xarray.pydata.org/en/stable/data-structures.html#dataset>`_
is a multi-dimensional version of pandas' DataFrame.

Pros
~~~~

- Can be used for heterogeneous data.

Cons
~~~~

- ``scikit-learn`` does not require many of the features Dataset provides.
- Needs to be converted to a DataArray before it can be converted to a numpy
  array.
- The `conversion from a pandas DataFrame to a Dataset
  <http://xarray.pydata.org/en/stable/pandas.html>`_
  is not lossless. For example, categorical dtypes in a pandas DataFrame will
  lose their categorical information when converted to a Dataset.
- xarray does not have as much adoption as pandas, which increases the learning
  curve for using Dataset with ``scikit-learn``.

XArray DataArray
----------------

`xarray's DataArray <http://xarray.pydata.org/en/stable/data-structures.html#dataarray>`_
is a data structure that stores homogeneous data.

Pros
~~~~

- xarray guarantees that there will be no copies during round-trips from
  numpy. (`xarray #3077 <https://github.com/pydata/xarray/issues/3077>`_)

Cons
~~~~

- Can only be used for homogeneous data.
- As with xarray's Dataset, DataArray does not have as much adoption as pandas,
  which increases the learning curve for using DataArray with ``scikit-learn``.

References and Footnotes
########################

.. [1] Each SLEP must either be explicitly labeled as placed in the public
   domain (see this SLEP as an example) or licensed under the `Open
   Publication License`_.

.. _Open Publication License: https://www.opencontent.org/openpub/


Copyright
#########

This document has been placed in the public domain. [1]_
diff --git a/slep015/proposal.rst b/slep015/proposal.rst
new file mode 100644
index 0000000..bea2d8f
--- /dev/null
+++ b/slep015/proposal.rst
@@ -0,0 +1,191 @@
.. _slep_015:

==================================
SLEP015: Feature Names Propagation
==================================

:Author: Thomas J Fan
:Status: Rejected
:Type: Standards Track
:Created: 2020-10-03

Abstract
########

This SLEP proposes adding the ``get_feature_names_out`` method to all
transformers and the ``feature_names_in_`` attribute for all estimators.
The ``feature_names_in_`` attribute is set during ``fit`` if the input, ``X``,
contains the feature names.

Motivation
##########

``scikit-learn`` is commonly used as a part of a larger data processing
pipeline. When this pipeline is used to transform data, the result is a
NumPy array, discarding column names. The current workflow for
extracting the feature names requires calling ``get_feature_names`` on the
transformer that created the feature.
This interface can be cumbersome when used
together with a pipeline with multiple column names::

    X = pd.DataFrame({'letter': ['a', 'b', 'c'],
                      'pet': ['dog', 'snake', 'dog'],
                      'distance': [1, 2, 3]})
    y = [0, 0, 1]
    orig_cat_cols, orig_num_cols = ['letter', 'pet'], ['distance']

    ct = ColumnTransformer(
        [('cat', OneHotEncoder(), orig_cat_cols),
         ('num', StandardScaler(), orig_num_cols)])
    pipe = make_pipeline(ct, LogisticRegression()).fit(X, y)

    cat_names = (pipe['columntransformer']
                 .named_transformers_['cat']
                 .get_feature_names(orig_cat_cols))

    feature_names = np.r_[cat_names, orig_num_cols]

The ``feature_names`` extracted above correspond to the features directly
passed into ``LogisticRegression``. As demonstrated above, the process of
extracting ``feature_names`` requires knowing the order of the selected
categories in the ``ColumnTransformer``. Furthermore, if there is feature
selection in the pipeline, such as ``SelectKBest``, the ``get_support`` method
would need to be used to infer the column names that were selected.

Solution
########

This SLEP proposes adding the ``feature_names_in_`` attribute to all estimators
that will extract the feature names of ``X`` during ``fit``. This will also
be used for validation during non-``fit`` methods such as ``transform`` or
``predict``. If ``X`` is not a recognized container with columns, then
``feature_names_in_`` can be undefined. If ``feature_names_in_`` is undefined,
then it will not be validated.

Secondly, this SLEP proposes adding ``get_feature_names_out(input_features=None)``
to all transformers. By default, the input features will be determined by the
``feature_names_in_`` attribute. The feature names of a pipeline can then be
easily extracted as follows::

    pipe[:-1].get_feature_names_out()
    # ['cat__letter_a', 'cat__letter_b', 'cat__letter_c',
    #  'cat__pet_dog', 'cat__pet_snake', 'num__distance']

Note that ``get_feature_names_out`` does not require ``input_features``,
because the feature names are stored in the pipeline itself. These names are
passed to each step's ``get_feature_names_out`` method to obtain the output
feature names of the ``Pipeline`` itself.

Enabling Functionality
######################

The following enhancements are **not** a part of this SLEP. These features are
made possible if this SLEP gets accepted.

1. This SLEP enables us to implement an ``array_out`` keyword argument to
   all ``transform`` methods to specify the array container outputted by
   ``transform``. An implementation of ``array_out`` requires
   ``feature_names_in_`` to validate that the names in ``fit`` and
   ``transform`` are consistent. An implementation of ``array_out`` needs
   a way to map from the input feature names to output feature names, which is
   provided by ``get_feature_names_out``.

2. An alternative to ``array_out``: Transformers in a pipeline may wish to have
   feature names passed in as ``X``. This can be enabled by adding an
   ``array_input`` parameter to ``Pipeline``::

    pipe = make_pipeline(ct, MyTransformer(), LogisticRegression(),
                         array_input='pandas')

   In this case, the pipeline will construct a pandas DataFrame to be inputted
   into ``MyTransformer`` and ``LogisticRegression``. The feature names
   will be constructed by calling ``get_feature_names_out`` as data is passed
   through the ``Pipeline``. This feature implies that ``Pipeline`` is
   doing the construction of the DataFrame.
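A rough sketch of how a pipeline could thread names through its steps
(illustrative only; ``pipeline_get_feature_names_out`` is a hypothetical
helper, not part of the proposal)::

    def pipeline_get_feature_names_out(pipe, input_features=None):
        # Start from the names seen at fit time if none are given.
        names = input_features
        if names is None:
            names = getattr(pipe, "feature_names_in_", None)
        for _, step in pipe.steps:
            if step == "passthrough":
                continue
            # Each step maps its input names to its output names.
            names = step.get_feature_names_out(names)
        return names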
Considerations and Limitations
##############################

1. The ``get_feature_names_out`` will be constructed using the name generation
   specification from :ref:`slep_007`.

2. For a ``Pipeline`` with only one estimator, slicing will not work and one
   would need to access the feature names directly::

    pipe1 = make_pipeline(StandardScaler(), LogisticRegression())
    pipe1[:-1].feature_names_in_  # Works

    pipe2 = make_pipeline(LogisticRegression())
    pipe2[:-1].feature_names_in_  # Does not work

   This is because ``pipe2[:-1]`` raises an error, as it would result in
   a pipeline with no steps. We can work around this by allowing pipelines
   with no steps.

3. ``feature_names_in_`` can be any 1-D ``Sequence``, such as a list or
   an ndarray.

4. Meta-estimators will delegate the setting and validation of
   ``feature_names_in_`` to their inner estimators. The meta-estimator will
   define ``feature_names_in_`` by referencing its inner estimators. For
   example, the ``Pipeline`` can use ``steps[0].feature_names_in_`` as
   the input feature names. If the inner estimators do not define
   ``feature_names_in_``, then the meta-estimator will not define
   ``feature_names_in_`` either.

Backward compatibility
######################

1. This SLEP is fully backward compatible with previous versions. With the
   introduction of ``get_feature_names_out``, ``get_feature_names`` will
   be deprecated. Note that ``get_feature_names_out``'s signature will
   always contain ``input_features``, which can be used or ignored. This
   helps standardize the interface of the get feature names method.

2. The inclusion of a ``get_feature_names_out`` method will not introduce any
   overhead to estimators.

3. The inclusion of a ``feature_names_in_`` attribute will increase the size of
   estimators, because they would store the feature names. Users can remove
   the attribute by calling ``del est.feature_names_in_`` if they want to
   remove the feature and disable validation.

Alternatives
############

There have been many attempts to address this issue:

1. ``array_out`` keyword parameter in ``transform``: This approach requires
   third party estimators to unwrap and wrap array containers in transform,
   which introduces more burden for third party estimator maintainers.
   Furthermore, ``array_out`` with sparse data will introduce an overhead when
   being passed along in a ``Pipeline``. This overhead comes from the
   construction of the sparse data container that has the feature names.

2. :ref:`slep_007`: ``SLEP007`` introduces a ``feature_names_out_`` attribute,
   while this SLEP proposes a ``get_feature_names_out`` method to accomplish
   the same task. The benefit of the ``get_feature_names_out`` method is that
   it can be used even if the feature names were not passed in ``fit`` with a
   dataframe. For example, in a ``Pipeline`` the feature names are not passed
   through to each step, and a ``get_feature_names_out`` method can be used to
   get the names of each step with slicing.

3. :ref:`slep_012`: The ``InputArray`` was developed to work around the
   overhead of using a pandas ``DataFrame`` or an xarray ``DataArray``. The
   introduction of another data structure into the Python data ecosystem would
   lead to more burden for third party estimator maintainers.


References and Footnotes
########################

.. [1] Each SLEP must either be explicitly labeled as placed in the public
   domain (see this SLEP as an example) or licensed under the `Open
   Publication License`_.

.. _Open Publication License: https://www.opencontent.org/openpub/


Copyright
#########

This document has been placed in the public domain. [1]_
diff --git a/slep017/proposal.rst b/slep017/proposal.rst
new file mode 100644
index 0000000..da0e508
--- /dev/null
+++ b/slep017/proposal.rst
@@ -0,0 +1,184 @@
===========================================================
SLEP017: Clone Override Protocol with ``__sklearn_clone__``
===========================================================

:Author: Joel Nothman
:Status: Accepted
:Type: Standards Track
:Created: 2022-03-19
:scikit-learn-Version: 1.3.0
:Resolution: https://github.com/scikit-learn/enhancement_proposals/pull/79

Abstract
--------

The ability to clone Scikit-learn estimators -- removing any state due to
previous fitting -- is essential to ensuring estimator configurations are
reusable across multiple instances in cross validation.
A centralised implementation of :func:`sklearn.base.clone` regards
an estimator's constructor parameters as the state that should be copied.
This proposal allows an estimator class to implement custom cloning
functionality with a ``__sklearn_clone__`` method, which will default to
the current ``clone`` behaviour.

Detailed description
--------------------

Cloning estimators is one way that Scikit-learn ensures that there is no
data leakage across data splits in cross-validation: by only copying an
estimator's configuration, with no data from previous fitting, the
estimator must fit with a cold start. Cloning an estimator often also
occurs prior to parallelism, ensuring that a minimal version of the
estimator -- without a large stored model -- is serialised and distributed.

Cloning is currently governed by the implementation of
:func:`sklearn.base.clone`, which recursively descends and copies the
parameters of the passed object. For an estimator, it constructs a new
instance of the estimator's class, passing to it cloned versions of the
parameter values returned by its ``get_params``. It then performs some
sanity checks to ensure that the values passed to the constructor are
identical to what is then returned by the clone's ``get_params``.

The current equivalence between constructor parameters and what is cloned
means that whenever an estimator or library developer deems it necessary
to have further configuration of an estimator reproduced in a clone,
they must include this configuration as a constructor parameter.

Cases where this need has been raised in Scikit-learn development include:

* ensuring metadata requests are cloned with an estimator
* ensuring parameter spaces are cloned with an estimator
* building a simple wrapper that can "freeze" a pre-fitted estimator
* allowing existing options for using prefitted models in ensembles
  to work under cloning

The current design also limits the ability for an estimator developer to
define an exception to the sanity checks (see :issue:`15371`).

This proposal empowers estimator developers to extend the base implementation
of ``clone`` by providing a ``__sklearn_clone__`` method, which ``clone`` will
delegate to when available. The default implementation will match current
``clone`` behaviour.
It will be provided through
``BaseEstimator.__sklearn_clone__`` but also
provided for estimators not inheriting from :obj:`~sklearn.base.BaseEstimator`.

This shifts the paradigm from ``clone`` being a fixed operation that
Scikit-learn must be able to perform on an estimator to ``clone`` being a
behaviour that each Scikit-learn compatible estimator may implement.

Developers that define ``__sklearn_clone__`` are expected to be responsible
in maintaining the fundamental properties of cloning. Ordinarily, they
can achieve this through use of ``super().__sklearn_clone__``. Core behaviours,
such as constructor parameters being preserved through ``clone`` operations,
can be ensured through estimator checks.

Implementation
--------------

Implementing this SLEP will require:

1. Factoring out `clone_parametrized` from `clone`, being the portion of its
   implementation that handles objects with `get_params`.
2. Modifying `clone` to call ``__sklearn_clone__`` when available on an
   object with ``get_params``, or ``clone_parametrized`` when not available.
3. Defining ``BaseEstimator.__sklearn_clone__`` to call ``clone_parametrized``.
4. Documenting the above.

Backward compatibility
----------------------

No breakage.

Alternatives
------------

Instead of allowing estimators to overwrite the entire clone process,
the core clone process could be obligatory, with the ability for an
estimator class to customise additional steps.

One API would allow an estimator class to provide
``__sklearn_post_clone__(self, source)`` for operations in addition
to the core cloning, or ``__sklearn_clone_attrs__`` could be defined
on a class to specify additional attributes that should be copied for
that class and its descendants.

Alternative solutions include continuing to force developers into providing
sometimes-awkward constructor parameters for any clonable material, and
Scikit-learn core developers having the exceptional ability to extend
the ``clone`` function as needed.

Discussion
----------

:issue:`5080` raised the proposal of polymorphism for ``clone`` as the right
way to provide an object-oriented API, and as a way to enable the
implementation of wrappers around estimators for model memoisation and
freezing.
The naming of ``__sklearn_clone__`` was further proposed and discussed in
:issue:`21838`.

Making cloning more flexible either enables or simplifies the design and
implementation of several features, including wrapping pre-fitted estimators,
and providing estimator configuration through methods without adding new
constructor arguments (e.g. through mixins).
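For instance, a hypothetical mixin could preserve method-set configuration
across clones by building on the proposed hook (an illustrative sketch, not a
planned API)::

    class RequestConfigMixin:
        """Hypothetical mixin keeping extra configuration across clones."""

        def set_my_request(self, **requests):
            self._my_requests = requests
            return self

        def __sklearn_clone__(self):
            # Reuse the default behaviour (constructor parameters only),
            # assuming BaseEstimator provides the hook further down the MRO...
            new = super().__sklearn_clone__()
            # ... then copy the configuration the default would drop.
            if hasattr(self, "_my_requests"):
                new._my_requests = dict(self._my_requests)
            return new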
Related issues include:

- :issue:`6451`, :issue:`8710`, :issue:`19848`: CalibratedClassifierCV with
  prefitted base estimator
- :issue:`7382`: VotingClassifier with prefitted base estimator
- :issue:`16748`: Stacking estimator with prefitted base estimator
- :issue:`8370`, :issue:`9464`: generic estimator wrapper for model freezing
- :issue:`5082`: configuring parameter search spaces
- :issue:`16079`: configuring the routing of sample-aligned metadata
- :issue:`16185`: configuring selected parameters to not be deep-copied

Under the incumbent monolithic clone implementation, designing such additional
per-estimator configuration requires resolving whether to:

- adjust the monolithic ``clone`` to account for the new configuration
  attributes (an option only available to the Scikit-learn core developer
  team);
- add constructor attributes for each new configuration option; or
- not clone estimator configurations, and accept that some use cases may not
  be possible.

A more flexible cloning operation provides a simpler pattern for adding new
configuration options through mixins.
It should be noted that adding new capabilities to *all* estimators remains
possible only through modifying the default ``__sklearn_clone__``
implementation.

There are, however, notable concerns in relation to this proposal.
Introducing a generic clone handler on each estimator gives a developer
complete freedom to disregard existing conventions regarding parameter
setting and construction in Scikit-learn.
In this vein, objections to :issue:`5080` cited the notion that "``clone``
has a simple contract," and that "extension to it would open the door to
violations of that contract" [2]_.

While these objections identify considerable risks, many public libraries
include developers regularly working around Scikit-learn conventions and
contracts, in part because developers are backed into a "design corner",
wherein it is not always obvious how to build an acceptable UX while adhering
to established conventions; in this case, that everything to be cloned must
go into ``__init__``. This proposal paves a road for developers to solve
functionality and UX limitations in the core library, rather than
inviting custom workarounds.

References and Footnotes
------------------------

.. [1] Each SLEP must either be explicitly labeled as placed in the public
   domain (see this SLEP as an example) or licensed under the `Open
   Publication License`_.
.. _Open Publication License: https://www.opencontent.org/openpub/

.. [2] `Gael Varoquaux's comments on #5080 in 2015
   <https://github.com/scikit-learn/scikit-learn/issues/5080#issuecomment-127128808>`__


Copyright
---------

This document has been placed in the public domain. [1]_
diff --git a/slep018/proposal.rst b/slep018/proposal.rst
new file mode 100644
index 0000000..f4b830f
--- /dev/null
+++ b/slep018/proposal.rst
@@ -0,0 +1,143 @@
.. _slep_018:

=======================================================
SLEP018: Pandas Output for Transformers with set_output
=======================================================

:Author: Thomas J. Fan
:Status: Accepted
:Type: Standards Track
:Created: 2022-06-22

Abstract
--------

This SLEP proposes a ``set_output`` method to configure the output data
container of scikit-learn transformers.

Detailed description
--------------------

Currently, scikit-learn transformers return NumPy ndarrays or SciPy sparse
matrices.
This SLEP proposes adding a ``set_output`` method to configure a
transformer to output pandas DataFrames::

    scaler = StandardScaler().set_output(transform="pandas")
    scaler.fit(X_df)

    # X_trans_df is a pandas DataFrame
    X_trans_df = scaler.transform(X_df)

The index of the output DataFrame must match the index of the input. If the
transformer does not support ``transform="pandas"``, then it must raise a
``ValueError`` stating that it does not support the feature.

This SLEP's only focus is dense data for ``set_output``. If a transformer
returns sparse data, e.g. ``OneHotEncoder(sparse=True)``, then ``transform``
will raise a ``ValueError`` if ``set_output(transform="pandas")`` is
configured. Dealing with sparse output might be the scope of another future
SLEP.

For a pipeline, calling ``set_output`` will configure all inner transformers
and does not configure non-transformers. This enables the following workflow::

    log_reg = make_pipeline(SimpleImputer(), StandardScaler(), LogisticRegression())
    log_reg.set_output(transform="pandas")

    # All transformers return DataFrames during fit
    log_reg.fit(X_df, y)

    # X_trans_df is a pandas DataFrame
    X_trans_df = log_reg[:-1].transform(X_df)

    # X_trans_df is again a pandas DataFrame
    X_trans_df = log_reg[0].transform(X_df)

    # The classifier contains the feature names in
    log_reg[-1].feature_names_in_

Meta-estimators that support ``set_output`` are required to configure all inner
transformers by calling ``set_output``. Specifically, all fitted and non-fitted
inner transformers must be configured with ``set_output``. This enables
``transform``'s output to be a DataFrame before and after the meta-estimator is
fitted. If an inner transformer does not define ``set_output``, then an error is
raised.


Global Configuration
....................

For ease of use, this SLEP proposes a global configuration flag that sets the
output for all transformers::

    import sklearn
    sklearn.set_config(transform_output="pandas")

The global default configuration is ``"default"``, where the transformer
determines the output container.

The configuration can also be set locally using the ``config_context`` context
manager::

    from sklearn import config_context
    with config_context(transform_output="pandas"):
        num_preprocessor = make_pipeline(SimpleImputer(), StandardScaler(), PCA())
        num_preprocessor.fit_transform(X_df)

The following specifies the precedence levels for the three ways to configure
the output container:

1. Locally configure a transformer: ``transformer.set_output``
2. Context manager: ``config_context``
3. Global configuration: ``set_config``

Implementation
--------------

A possible implementation of this SLEP is worked out in :pr:`23734`.

Backward compatibility
----------------------

There are no backward compatibility concerns, because the ``set_output`` method
is a new API. Third party transformers can opt in to the API by defining
``set_output``.

Alternatives
------------

Alternatives to this SLEP include:

1. `SLEP014 <https://github.com/scikit-learn/enhancement_proposals/pull/37>`__
   proposes that if the input is a DataFrame then the output is a DataFrame.
2. Prototype `#20100
   <https://github.com/scikit-learn/scikit-learn/pull/20100>`__ showcases
   ``array_out="pandas"`` in `transform`. This API is limited because it does
   not directly support fitting on a pipeline where the steps require
   DataFrame inputs.
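To make the wrapping mechanics concrete, a simplified sketch follows (the
actual implementation is worked out in :pr:`23734`; ``_transform`` here is a
hypothetical hook standing in for the transformer's own logic)::

    import pandas as pd

    class SetOutputMixin:
        """Sketch of the behaviour, ignoring global config and sparse data."""

        def set_output(self, *, transform=None):
            self._sklearn_output_config = transform
            return self

        def transform(self, X):
            X_trans = self._transform(X)
            if getattr(self, "_sklearn_output_config", "default") == "pandas":
                return pd.DataFrame(
                    X_trans,
                    columns=self.get_feature_names_out(),
                    # the output index must match the input index
                    index=getattr(X, "index", None),
                )
            return X_trans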
Discussion
----------

Issues and pull requests discussing pandas output include `#14315
<https://github.com/scikit-learn/scikit-learn/pull/14315>`__, `#20100
<https://github.com/scikit-learn/scikit-learn/pull/20100>`__, and `#23001
<https://github.com/scikit-learn/scikit-learn/issues/23001>`__. This SLEP
proposes configuring the output to be pandas because it is the DataFrame
library that is most widely used and requested by users. The ``set_output``
API can be extended to support additional DataFrame libraries and sparse data
formats in the future.

References and Footnotes
------------------------

.. [1] Each SLEP must either be explicitly labeled as placed in the public
   domain (see this SLEP as an example) or licensed under the `Open
   Publication License`_.

.. _Open Publication License: https://www.opencontent.org/openpub/


Copyright
---------

This document has been placed in the public domain. [1]_

diff --git a/slep019/proposal.rst b/slep019/proposal.rst
new file mode 100644
index 0000000..b416b90
--- /dev/null
+++ b/slep019/proposal.rst
@@ -0,0 +1,221 @@
.. _slep_019:

####################################################################
 SLEP019: Governance Update - Recognizing Contributions Beyond Code
####################################################################

:Author: Julien Jerphanion <git@jjerphan.xyz>, Gaël Varoquaux <gael.varoquaux@normalesup.org>
:Status: Withdrawn
:Type: Process
:Created: 2022-09-12

**********
 Abstract
**********

This SLEP proposes updating the Governance to broaden the notion of
contribution in scikit-learn and to ease subsequent related changes to
the Governance without requiring SLEPs.

************
 Motivation
************

Current state
=============

The formal decision-making process of the scikit-learn project is
limited to a subset of contributors, called Core Developers (also
referred to as Maintainers). Their active and consistent contributions
are recognized through:

- being part of the scikit-learn organisation on GitHub
- receiving “commit rights” to the repository
- having their Pull Request reviews recognised as authoritative
- having voting rights for the project direction (promoting a
  contributor to be a core developer, approving a SLEP, etc.)

Core Developers are primarily selected based on their code
contributions. However, there are many other ways to contribute to
the project, and these efforts are currently not recognized [1]_. To
quote Melissa Weber Mendonça [2]_ and Reshama Shaikh [3]_:

.. epigraph::

   "When some people join an open source project, they may be asked to
   contribute with tasks that will never get them on a path to any sort of
   official input, such as voting rights."

Desired Goal: incrementally adapt the Governance
================================================

We need to:

- value non-coding contributions in the project and acknowledge all
  efforts, including those that are not quantified by GitHub users'
  activity

- empower more contributors to effectively participate in the project
  without requiring the security responsibilities of tracking code
  changes to the main branches. These considerations should lead to the
  diversification of contribution paths [4]_.

Rather than introducing an entirely new structure and Governance, we
propose changes to the existing ones which allow for small incremental
modifications over time.
******************
 Proposed changes
******************

Some of the proposed modifications have been discussed in the monthly
meetings, on April 25th 2022 [5]_ and September 5th 2022 [6]_.

Define "Contributions" more broadly
===================================

Explicitly define Contributions and emphasize the importance of non-code
contributions in the Governance structure.

Evolve the Technical Committee into a Steering Committee
========================================================

Rename "Technical Committee" to "Steering Committee".

Define the Steering Committee as a subset of Core Contributors rather
than a subset of Core Developers.

Create a Triage Team
====================

Create a Triage Team which would be given "Write" permissions on GitHub
[7]_ to be able to perform triaging tasks, such as editing issues'
descriptions.

Define "Core Contributors"
==========================

Establish all members of the following teams as "Core Contributors":

   - Triage Team
   - Communication Team
   - Development Team

A Contributor is promoted to a Core Contributor after being proposed by
at least one existing Core Contributor. The proposal must specify which
Core Team the Contributor will be part of. The promotion is effective
after a vote on the private Core Contributor mailing list, which must
last for two weeks and reach at least a two-thirds positive majority of
the votes cast.

Extend voting rights
====================

Give voting rights to all Core Contributors.

Simplify subsequent changes to the Governance
=============================================

Allow changes to the following aspects of the scikit-learn Governance
without requiring a SLEP:

   - additions and changes to Roles' and Teams' scopes
   - additions and changes to Roles' and Teams' permissions

Any changes to the scikit-learn Governance (including ones which do not
require being backed by a SLEP) will continue to be subject to the
decision-making process [8]_, which includes a vote of the Core
Contributors.

If subsequent changes to the Governance are proposed through a GitHub
Pull Request (PR):

   - a positive vote is cast by approving the PR (i.e. an "Approve"
     review)
   - a negative vote is cast by requesting changes to the PR (i.e. a
     "Request changes" review)

In this case, the vote still has to be announced on the Core
Contributors' mailing list, but the system of Pull Request approvals
will replace a vote on the private Core Contributors' mailing list.


**************
Acknowledgment
**************

We thank the following people who have helped with discussions during the
development of this SLEP:

- Lucy Liu: https://github.com/lucyleeow
- Noa Tamir: https://github.com/noatamir
- Reshama Shaikh: https://github.com/reshamas
- Tim Head: https://github.com/betatim

***********
 Copyright
***********

This document has been placed in the public domain [9]_.

**************************
 References and Footnotes
**************************

.. [1]

   J.-G. Young, A. Casari, K. McLaughlin, M. Z. Trujillo, L.
   Hébert-Dufresne and J. P. Bagrow, "Which contributions count? Analysis
   of attribution in open source," 2021 IEEE/ACM 18th International
   Conference on Mining Software Repositories (MSR), 2021, pp. 242-253,
   doi: 10.1109/MSR52588.2021.00036: https://arxiv.org/abs/2103.11007
.. [2]

   Contributor experience, diversity and culture in Open Source Projects:
   keynote from Melissa Weber Mendonça:
   https://2022.pycon.de/program/NVBLKH/

.. [3]

   Reshama Shaikh's quote from Melissa Weber Mendonça's keynote:
   https://twitter.com/reshamas/status/1513488342767353857

.. [4]

   NumPy Newcomer's Hour: an Experiment on Community Building, talk from
   Melissa Weber Mendonça: https://www.youtube.com/watch?v=c0XZQbu0xnw

.. [5]

   scikit-learn April 25th 2022 Developer meeting notes:
   https://github.com/scikit-learn/administrative/blob/master/meeting_notes/2022-04-25.md

.. [6]

   scikit-learn September 5th 2022 Developer meeting notes:
   https://github.com/scikit-learn/administrative/blob/master/meeting_notes/2022-09-05.md

.. [7]

   Permissions for each role, Repository roles for an organization, GitHub
   Docs:
   https://docs.github.com/en/organizations/managing-access-to-your-organizations-repositories/repository-roles-for-an-organization#permissions-for-each-role

.. [8]

   Decision Making Process, scikit-learn Governance and Decision-Making:
   https://scikit-learn.org/dev/governance.html#decision-making-process

.. [9]

   Open Publication License: https://www.opencontent.org/openpub/


****
Note
****

Since SLEP020 allows the Governance to be modified without requiring a SLEP,
many of the changes discussed in this SLEP are expected to be taken up and
implemented in subsequent PRs on the main repository that change the
Governance directly.

diff --git a/slep020/proposal.rst b/slep020/proposal.rst
new file mode 100644
index 0000000..5037d4e
--- /dev/null
+++ b/slep020/proposal.rst
@@ -0,0 +1,63 @@
.. _slep_020:

=======================================
SLEP020: Simplifying Governance Changes
=======================================

:Author: Thomas J Fan
:Status: Accepted
:Type: Process
:Created: 2023-01-09
:Resolution: https://github.com/scikit-learn/enhancement_proposals/pull/85

Abstract
--------

This SLEP proposes to permit governance changes through GitHub Pull Requests,
where a vote will also occur in the Pull Request.

Detailed description
--------------------

Currently, scikit-learn's governance document [2]_ requires an enhancement
proposal for any change to the governance document itself. In this SLEP, we
propose simplifying the process by allowing governance changes through GitHub
Pull Requests. Once the authors are happy with the state of the Pull Request,
they can call for a vote on the mailing list. No changes are allowed until the
vote is closed. A Pull Request approval will count as a positive vote, and a
"Request Changes" review will count as a negative vote. The voting period will
remain one month, as stated in the current Governance and Decision-Making
Document [2]_.

Discussion
----------

Members of the scikit-learn community have discussed changing the governance
through :ref:`SLEP019 <slep_019>` in the following PRs:

1. `enhancement_proposals#74 <https://github.com/scikit-learn/enhancement_proposals/pull/74>`__
   proposed updating the Governance to broaden the notion of contribution in
   scikit-learn. The draft was approved and merged on 2022-11-18.
2. `enhancement_proposals#81 <https://github.com/scikit-learn/enhancement_proposals/pull/81>`__
   proposed updates to :ref:`SLEP019 <slep_019>`.

:ref:`SLEP019 <slep_019>` also includes the voting change proposed in this SLEP.
This SLEP's goal is to simplify the process of making governance changes, thus
enabling the governance structure to evolve more efficiently.
References and Footnotes
------------------------

.. [1] Each SLEP must either be explicitly labeled as placed in the public
   domain (see this SLEP as an example) or licensed under the `Open
   Publication License`_.
.. [2] `scikit-learn Governance and Decision-Making
   <https://scikit-learn.org/stable/governance.html#decision-making-process>`__

.. _Open Publication License: https://www.opencontent.org/openpub/


Copyright
---------

This document has been placed in the public domain. [1]_

diff --git a/slep_template.rst b/slep_template.rst
new file mode 100644
index 0000000..b05f710
--- /dev/null
+++ b/slep_template.rst
@@ -0,0 +1,77 @@
==============================
SLEP Template and Instructions
==============================

:Author: <list of authors' real names and optionally, email addresses>
:Status: <Draft | Active | Accepted | Deferred | Rejected | Withdrawn |
         Final | Superseded>
:Type: <Standards Track | Process>
:Created: <date created on, in yyyy-mm-dd format>
:Resolution: <url> (required for Accepted | Rejected | Withdrawn)

Abstract
--------

The abstract should be a short description of what the SLEP will achieve.


Detailed description
--------------------

This section describes the need for the SLEP. It should describe the
existing problem that it is trying to solve and why this SLEP makes the
situation better. It should include examples of how the new functionality
would be used and perhaps some use cases.


Implementation
--------------

This section lists the major steps required to implement the SLEP. Where
possible, it should be noted where one step is dependent on another, and which
steps may be optionally omitted. Where it makes sense, each step should
include a link to related pull requests as the implementation progresses.

Any pull requests or development branches containing work on this SLEP should
be linked to from here. (A SLEP does not need to be implemented in a single
pull request if it makes sense to implement it in discrete phases.)


Backward compatibility
----------------------

This section describes the ways in which the SLEP breaks backward
compatibility.


Alternatives
------------

If there were any alternative solutions to solving the same problem, they
should be discussed here, along with a justification for the chosen
approach.


Discussion
----------

This section may just be a bullet list including links to any discussions
regarding the SLEP:

- This includes links to mailing list threads or relevant GitHub issues.


References and Footnotes
------------------------

.. [1] Each SLEP must either be explicitly labeled as placed in the public
   domain (see this SLEP as an example) or licensed under the `Open
   Publication License`_.

.. _Open Publication License: https://www.opencontent.org/openpub/


Copyright
---------

This document has been placed in the public domain. [1]_