From cf39e6aec6c26d4fd8f8aecc2af1129810b076c9 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 4 Aug 2020 00:13:26 +1000 Subject: [PATCH 1/4] Examples for case 0a --- slep006/cases_opt0a.py | 94 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 93 insertions(+), 1 deletion(-) diff --git a/slep006/cases_opt0a.py b/slep006/cases_opt0a.py index d0141fa..24656f4 100644 --- a/slep006/cases_opt0a.py +++ b/slep006/cases_opt0a.py @@ -1,6 +1,98 @@ +import numpy as np + from defs import (accuracy, group_cv, make_scorer, SelectKBest, LogisticRegressionCV, cross_validate, make_pipeline, X, y, my_groups, my_weights, my_other_weights) -# TODO +# %% +# Case A: weighted scoring and fitting + + +GROUPS_IDX = -1 +WEIGHT_IDX = -2 + + +def unwrap_X(X): + return X[:, :-2] + + +class WrappedGroupCV: + def __init__(self, base_cv, groups_idx=GROUPS_IDX): + self.base_cv = base_cv + self.groups_idx = groups_idx + + def split(self, X, y, groups=None): + groups = X[:, self.groups_idx] + return self.base_cv.split(unwrap_X(X), y, groups=groups) + + def get_n_splits(self, X, y, groups=None): + groups = X[:, self.groups_idx] + return self.base_cv.split(unwrap_X(X), y, groups=groups) + + +wrapped_group_cv = WrappedGroupCV(group_cv) + + +class WrappedLogisticRegressionCV(LogisticRegressionCV): + def fit(self, X, y): + return super().fit(unwrap_X(X), y, sample_weight=X[:, WEIGHT_IDX]) + + +weighted_acc = make_scorer(accuracy, request_props=['sample_weight']) + + +def wrapped_weighted_acc(est, X, y, sample_weight=None): + return weighted_acc(est, unwrap_X(X), y, sample_weight=X[:, WEIGHT_IDX]) + + +lr = WrappedLogisticRegressionCV( + cv=wrapped_group_cv, + scoring=wrapped_weighted_acc, +).set_props_request(['sample_weight']) +cross_validate(lr, np.hstack([X, my_weights, my_groups]), y, + cv=wrapped_group_cv, + scoring=wrapped_weighted_acc) + +# %% +# Case B: weighted scoring and unweighted fitting + +class UnweightedWrappedLogisticRegressionCV(LogisticRegressionCV): + def fit(self, X, 
y): + return super().fit(unwrap_X(X), y) + + +lr = UnweightedWrappedLogisticRegressionCV( + cv=wrapped_group_cv, + scoring=wrapped_weighted_acc, +).set_props_request(['sample_weight']) +cross_validate(lr, np.hstack([X, my_weights, my_groups]), y, + cv=wrapped_group_cv, + scoring=wrapped_weighted_acc) + + +# %% +# Case C: unweighted feature selection + +class UnweightedWrappedSelectKBest(SelectKBest): + def fit(self, X, y): + return super().fit(unwrap_X(X), y) + + +lr = WrappedLogisticRegressionCV( + cv=wrapped_group_cv, + scoring=weighted_acc, +).set_props_request(['sample_weight']) +sel = UnweightedWrappedSelectKBest() +pipe = make_pipeline(sel, lr) +cross_validate(pipe, np.hstack([X, my_weights, my_groups]), y, + cv=wrapped_group_cv, + scoring=wrapped_weighted_acc) + +# %% +# Case D: different scoring and fitting weights + +SCORING_WEIGHT_IDX = -3 + +# TODO: proceed from here. Note that this change implies the need to add +# a parameter to unwrap_X, since we will now append an additional column to X. From 70f7a951893c04114066f461575881e3097e75af Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 4 Aug 2020 00:14:00 +1000 Subject: [PATCH 2/4] Add variant for solution 4 --- slep006/cases_opt4b.py | 78 ++++++++++++++++++++++++++++++++++++++++++ slep006/proposal.rst | 26 +++++++++++++- 2 files changed, 103 insertions(+), 1 deletion(-) create mode 100644 slep006/cases_opt4b.py diff --git a/slep006/cases_opt4b.py b/slep006/cases_opt4b.py new file mode 100644 index 0000000..58aeaa4 --- /dev/null +++ b/slep006/cases_opt4b.py @@ -0,0 +1,78 @@ +from defs import (accuracy, group_cv, make_scorer, SelectKBest, + LogisticRegressionCV, cross_validate, + make_pipeline, X, y, my_groups, my_weights, + my_other_weights) + +# %% +# Case A: weighted scoring and fitting + +# Here we presume that GroupKFold requests `groups` by default. +# We need to explicitly request weights in make_scorer and for +# LogisticRegressionCV. 
Both of these consumers understand the meaning +# of the key "sample_weight". + +weighted_acc = make_scorer(accuracy, request_props=['sample_weight']) +lr = LogisticRegressionCV( + cv=group_cv, + scoring=weighted_acc, +).request_sample_weight(fit=['sample_weight']) +cross_validate(lr, X, y, cv=group_cv, + props={'sample_weight': my_weights, 'groups': my_groups}, + scoring=weighted_acc) + +# Error handling: if props={'sample_eight': my_weights, ...} was passed, +# cross_validate would raise an error, since 'sample_eight' was not requested +# by any of its children. + +# %% +# Case B: weighted scoring and unweighted fitting + +# Since LogisticRegressionCV requires that weights explicitly be requested, +# removing that request means the fitting is unweighted. + +weighted_acc = make_scorer(accuracy, request_props=['sample_weight']) +lr = LogisticRegressionCV( + cv=group_cv, + scoring=weighted_acc, +) +cross_validate(lr, X, y, cv=group_cv, + props={'sample_weight': my_weights, 'groups': my_groups}, + scoring=weighted_acc) + +# %% +# Case C: unweighted feature selection + +# Like LogisticRegressionCV, SelectKBest needs to request weights explicitly. +# Here it does not request them. + +weighted_acc = make_scorer(accuracy, request_props=['sample_weight']) +lr = LogisticRegressionCV( + cv=group_cv, + scoring=weighted_acc, +).request_sample_weight(fit=['sample_weight']) +sel = SelectKBest() +pipe = make_pipeline(sel, lr) +cross_validate(pipe, X, y, cv=group_cv, + props={'sample_weight': my_weights, 'groups': my_groups}, + scoring=weighted_acc) + +# %% +# Case D: different scoring and fitting weights + +# Despite make_scorer and LogisticRegressionCV both expecting a key +# sample_weight, we can use aliases to pass different weights to different +# consumers. 
+ +weighted_acc = make_scorer(accuracy, + request_props={'scoring_weight': 'sample_weight'}) +lr = LogisticRegressionCV( + cv=group_cv, + scoring=weighted_acc, +).request_sample_weight(fit='fitting_weight') +cross_validate(lr, X, y, cv=group_cv, + props={ + 'scoring_weight': my_weights, + 'fitting_weight': my_other_weights, + 'groups': my_groups, + }, + scoring=weighted_acc) diff --git a/slep006/proposal.rst b/slep006/proposal.rst index c336e4a..dfcd47a 100644 --- a/slep006/proposal.rst +++ b/slep006/proposal.rst @@ -79,7 +79,7 @@ Other related issues include: :issue:`1574`, :issue:`2630`, :issue:`3524`, :issue:`4632`, :issue:`4652`, :issue:`4660`, :issue:`4696`, :issue:`6322`, :issue:`7112`, :issue:`7646`, :issue:`7723`, :issue:`8127`, :issue:`8158`, :issue:`8710`, :issue:`8950`, :issue:`11429`, :issue:`12052`, :issue:`15282`, -:issues:`15370`, :issue:`15425`. +:issue:`15370`, :issue:`15425`, :issue:`18028`. Desiderata ---------- @@ -368,6 +368,14 @@ Disadvantages: `set_props_request` method (instead of the `request_props` constructor parameter approach) such that all legacy base estimators are automatically equipped. +* Aliasing is a bit confusing in this design, in that the consumer still + accepts the fit param by its original name (e.g. `sample_weight`) even if it + has a request that specifies a different key given to the router (e.g. + `fit_sample_weight`). This design has the advantage that the handling of + props within a consumer is simple and unchanged; the complexity is in + how it is forwarded the data by the router, but it may be conceptually + difficult for users to understand. (This may be acceptable, as an advanced + feature.) * For estimators to be cloned, this request information needs to be cloned with it. This implies one of: the request information be stored as a constructor paramerter; or `clone` is extended to explicitly copy request information. @@ -389,6 +397,22 @@ Test cases: .. 
literalinclude:: cases_opt4.py +Extensions and alternatives to the syntax considered while working on +:pr:`16079`: + +* `set_prop_request` and `get_props_request` have lists of props requested + **for each method** i.e. fit, score, transform, predict and perhaps others. +* `set_props_request` could be replaced by a method (or parameter) representing + the routing of each prop that it consumes. For example, an estimator that + consumes `sample_weight` would have a `request_sample_weight` method. One of + the difficulties of this approach is automatically introducing + `request_sample_weight` into classes inheriting from BaseEstimator without + too much magic (e.g. meta-classes, which might be the simplest solution). + +These are demonstrated together in the following: + +.. literalinclude:: cases_opt4b.py + Naming ------ From 1eb2642beecf6f00181387495631b86941e6b88e Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 4 Aug 2020 00:23:27 +1000 Subject: [PATCH 3/4] Examples for case 0b --- slep006/cases_opt0a.py | 10 ++--- slep006/cases_opt0b.py | 88 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 91 insertions(+), 7 deletions(-) diff --git a/slep006/cases_opt0a.py b/slep006/cases_opt0a.py index 24656f4..e94c89e 100644 --- a/slep006/cases_opt0a.py +++ b/slep006/cases_opt0a.py @@ -1,6 +1,6 @@ import numpy as np -from defs import (accuracy, group_cv, make_scorer, SelectKBest, +from defs import (accuracy, group_cv, get_scorer, SelectKBest, LogisticRegressionCV, cross_validate, make_pipeline, X, y, my_groups, my_weights, my_other_weights) @@ -28,7 +28,7 @@ def split(self, X, y, groups=None): def get_n_splits(self, X, y, groups=None): groups = X[:, self.groups_idx] - return self.base_cv.split(unwrap_X(X), y, groups=groups) + return self.base_cv.get_n_splits(unwrap_X(X), y, groups=groups) wrapped_group_cv = WrappedGroupCV(group_cv) @@ -39,11 +39,11 @@ def fit(self, X, y): return super().fit(unwrap_X(X), y, sample_weight=X[:, WEIGHT_IDX]) -weighted_acc = 
make_scorer(accuracy, request_props=['sample_weight']) +acc_scorer = get_scorer('accuracy') def wrapped_weighted_acc(est, X, y, sample_weight=None): - return weighted_acc(est, unwrap_X(X), y, sample_weight=X[:, WEIGHT_IDX]) + return acc_scorer(est, unwrap_X(X), y, sample_weight=X[:, WEIGHT_IDX]) lr = WrappedLogisticRegressionCV( @@ -81,7 +81,7 @@ def fit(self, X, y): lr = WrappedLogisticRegressionCV( cv=wrapped_group_cv, - scoring=weighted_acc, + scoring=wrapped_weighted_acc, ).set_props_request(['sample_weight']) sel = UnweightedWrappedSelectKBest() pipe = make_pipeline(sel, lr) diff --git a/slep006/cases_opt0b.py b/slep006/cases_opt0b.py index f543e9b..6af76cd 100644 --- a/slep006/cases_opt0b.py +++ b/slep006/cases_opt0b.py @@ -1,7 +1,91 @@ import pandas as pd -from defs import (accuracy, group_cv, make_scorer, SelectKBest, +from defs import (accuracy, group_cv, get_scorer, SelectKBest, LogisticRegressionCV, cross_validate, make_pipeline, X, y, my_groups, my_weights, my_other_weights) -# TODO +X = pd.DataFrame(X) +MY_GROUPS = pd.Series(my_groups) +MY_WEIGHTS = pd.Series(my_weights) +MY_OTHER_WEIGHTS = pd.Series(my_other_weights) + +# %% +# Case A: weighted scoring and fitting + + +class WrappedGroupCV: + def __init__(self, base_cv): + self.base_cv = base_cv + + def split(self, X, y, groups=None): + return self.base_cv.split(X, y, groups=MY_GROUPS.loc[X.index]) + + def get_n_splits(self, X, y, groups=None): + return self.base_cv.get_n_splits(X, y, groups=MY_GROUPS.loc[X.index]) + + +wrapped_group_cv = WrappedGroupCV(group_cv) + + +class WeightedLogisticRegressionCV(LogisticRegressionCV): + def fit(self, X, y): + return super().fit(X, y, sample_weight=MY_WEIGHTS.loc[X.index]) + + +acc_scorer = get_scorer('accuracy') + + +def wrapped_weighted_acc(est, X, y, sample_weight=None): + return acc_scorer(est, X, y, sample_weight=MY_WEIGHTS.loc[X.index]) + + +lr = WeightedLogisticRegressionCV( + cv=wrapped_group_cv, + scoring=wrapped_weighted_acc, 
+).set_props_request(['sample_weight']) +cross_validate(lr, X, y, + cv=wrapped_group_cv, + scoring=wrapped_weighted_acc) + +# %% +# Case B: weighted scoring and unweighted fitting + +lr = LogisticRegressionCV( + cv=wrapped_group_cv, + scoring=wrapped_weighted_acc, +).set_props_request(['sample_weight']) +cross_validate(lr, X, y, + cv=wrapped_group_cv, + scoring=wrapped_weighted_acc) + + +# %% +# Case C: unweighted feature selection + +lr = WeightedLogisticRegressionCV( + cv=wrapped_group_cv, + scoring=wrapped_weighted_acc, +).set_props_request(['sample_weight']) +sel = SelectKBest() +pipe = make_pipeline(sel, lr) +cross_validate(pipe, X, y, + cv=wrapped_group_cv, + scoring=wrapped_weighted_acc) + +# %% +# Case D: different scoring and fitting weights + + +def other_weighted_acc(est, X, y, sample_weight=None): + return acc_scorer(est, X, y, sample_weight=MY_OTHER_WEIGHTS.loc[X.index]) + + +lr = WeightedLogisticRegressionCV( + cv=wrapped_group_cv, + scoring=other_weighted_acc, +).set_props_request(['sample_weight']) +sel = SelectKBest() +pipe = make_pipeline(sel, lr) +cross_validate(pipe, X, y, + cv=wrapped_group_cv, + scoring=other_weighted_acc) From ad7cfb66957467f482640796272c3605468082cd Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 4 Aug 2020 23:03:44 +1000 Subject: [PATCH 4/4] Complete some TODOs --- slep006/proposal.rst | 43 +++++++++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/slep006/proposal.rst b/slep006/proposal.rst index dfcd47a..1612d27 100644 --- a/slep006/proposal.rst +++ b/slep006/proposal.rst @@ -429,30 +429,49 @@ Proposal Having considered the above solutions, we propose: -TODO +* Solution 4 per :pr:`16079` which will be used to resolve further, specific + details of the solution. +* Props will be known simply as Metadata. +* `**kw` syntax will be used to pass props by key. -* which solution? -* if an estimator requests a prop, must it be not-null? 
Must it be provided or explicitly passed as None? -* props param or kwargs? -* naming? +TODO: + +* if an estimator requests a prop, must it be not-null? Must it be provided or + explicitly passed as None? Backward compatibility ---------------------- -TODO +Under this proposal, consumer behaviour will be backwards compatible, but +meta-estimators will change their routing behaviour. + +By default, `sample_weight` will not be requested by estimators that support +it. This ensures that addition of `sample_weight` support to an estimator will +not change its behaviour. -TODO: Do we continue to handle sample_weight such that it only gets provided of requested explicitly? Or do we make it requested by default in the future (possibly with a deprecation period)? +During a deprecation period, fit_params will be handled dually: Keys that are +requested will be passed through the new request mechanism, while keys that are +not known will be routed using legacy mechanisms. At completion of the +deprecation period, the legacy handling will cease. -During a deprecation period, fit_params will be handled dually: Keys that are requested will be passed through the new request mechanism, while keys that are not known will be routed using legacy mechanisms. At completion of the deprecation period, the legacy handling will cease. +Similarly, during a deprecation period, `fit_params` in GridSearchCV and +related utilities will be routed to the estimator's `fit` by default, per +incumbent behaviour. After the deprecation period, an error will be raised for +any params not explicitly requested. -Grouped cross validation splitters will request `groups` since they were previously unusable in a nested cross validation context, so this should not often create backwards incompatibilities, except perhaps where a fit param named `groups` served another purpose. 
+Grouped cross validation splitters will request `groups` since they were +previously unusable in a nested cross validation context, so this should not +often create backwards incompatibilities, except perhaps where a fit param +named `groups` served another purpose. Discussion ---------- One benefit of the explicitness in Solution 4 is that even if it makes use of **kw arguments, it does not preclude keywords arguments serving other purposes in addition. That is, in addition to requesting sample props, a future proposal could allow estimators to request feature metadata or other keys. -TODO +One benefit of the explicitness in Solution 4 is that even if it makes use of +`**kw` arguments, it does not preclude keyword arguments serving other +purposes in addition. That is, in addition to requesting sample props, a +future proposal could allow estimators to request feature metadata or other +keys. References and Footnotes ------------------------