diff --git a/slep006/cases_opt0a.py b/slep006/cases_opt0a.py index d0141fa..e94c89e 100644 --- a/slep006/cases_opt0a.py +++ b/slep006/cases_opt0a.py @@ -1,6 +1,98 @@ -from defs import (accuracy, group_cv, make_scorer, SelectKBest, +import numpy as np + +from defs import (accuracy, group_cv, get_scorer, SelectKBest, LogisticRegressionCV, cross_validate, make_pipeline, X, y, my_groups, my_weights, my_other_weights) -# TODO +# %% +# Case A: weighted scoring and fitting + + +GROUPS_IDX = -1 +WEIGHT_IDX = -2 + + +def unwrap_X(X): + return X[:, :-2] + + +class WrappedGroupCV: + def __init__(self, base_cv, groups_idx=GROUPS_IDX): + self.base_cv = base_cv + self.groups_idx = groups_idx + + def split(self, X, y, groups=None): + groups = X[:, self.groups_idx] + return self.base_cv.split(unwrap_X(X), y, groups=groups) + + def get_n_splits(self, X, y, groups=None): + groups = X[:, self.groups_idx] + return self.base_cv.get_n_splits(unwrap_X(X), y, groups=groups) + + +wrapped_group_cv = WrappedGroupCV(group_cv) + + +class WrappedLogisticRegressionCV(LogisticRegressionCV): + def fit(self, X, y): + return super().fit(unwrap_X(X), y, sample_weight=X[:, WEIGHT_IDX]) + + +acc_scorer = get_scorer('accuracy') + + +def wrapped_weighted_acc(est, X, y, sample_weight=None): + return acc_scorer(est, unwrap_X(X), y, sample_weight=X[:, WEIGHT_IDX]) + + +lr = WrappedLogisticRegressionCV( + cv=wrapped_group_cv, + scoring=wrapped_weighted_acc, +).set_props_request(['sample_weight']) +cross_validate(lr, np.hstack([X, my_weights, my_groups]), y, + cv=wrapped_group_cv, + scoring=wrapped_weighted_acc) + +# %% +# Case B: weighted scoring and unweighted fitting + +class UnweightedWrappedLogisticRegressionCV(LogisticRegressionCV): + def fit(self, X, y): + return super().fit(unwrap_X(X), y) + + +lr = UnweightedWrappedLogisticRegressionCV( + cv=wrapped_group_cv, + scoring=wrapped_weighted_acc, +).set_props_request(['sample_weight']) +cross_validate(lr, np.hstack([X, my_weights, my_groups]), y, + 
cv=wrapped_group_cv, + scoring=wrapped_weighted_acc) + + +# %% +# Case C: unweighted feature selection + +class UnweightedWrappedSelectKBest(SelectKBest): + def fit(self, X, y): + return super().fit(unwrap_X(X), y) + + +lr = WrappedLogisticRegressionCV( + cv=wrapped_group_cv, + scoring=wrapped_weighted_acc, +).set_props_request(['sample_weight']) +sel = UnweightedWrappedSelectKBest() +pipe = make_pipeline(sel, lr) +cross_validate(pipe, np.hstack([X, my_weights, my_groups]), y, + cv=wrapped_group_cv, + scoring=wrapped_weighted_acc) + +# %% +# Case D: different scoring and fitting weights + +SCORING_WEIGHT_IDX = -3 + +# TODO: proceed from here. Note that this change implies the need to add +# a parameter to unwrap_X, since we will now append an additional column to X. diff --git a/slep006/cases_opt0b.py b/slep006/cases_opt0b.py index f543e9b..6af76cd 100644 --- a/slep006/cases_opt0b.py +++ b/slep006/cases_opt0b.py @@ -1,7 +1,91 @@ import pandas as pd -from defs import (accuracy, group_cv, make_scorer, SelectKBest, +from defs import (accuracy, group_cv, get_scorer, SelectKBest, LogisticRegressionCV, cross_validate, make_pipeline, X, y, my_groups, my_weights, my_other_weights) -# TODO +X = pd.DataFrame(X) +MY_GROUPS = pd.Series(my_groups) +MY_WEIGHTS = pd.Series(my_weights) +MY_OTHER_WEIGHTS = pd.Series(my_other_weights) + +# %% +# Case A: weighted scoring and fitting + + +class WrappedGroupCV: + def __init__(self, base_cv): + self.base_cv = base_cv + + def split(self, X, y, groups=None): + return self.base_cv.split(X, y, groups=MY_GROUPS.loc[X.index]) + + def get_n_splits(self, X, y, groups=None): + return self.base_cv.get_n_splits(X, y, groups=MY_GROUPS.loc[X.index]) + + +wrapped_group_cv = WrappedGroupCV(group_cv) + + +class WeightedLogisticRegressionCV(LogisticRegressionCV): + def fit(self, X, y): + return super().fit(X, y, sample_weight=MY_WEIGHTS.loc[X.index]) + + +acc_scorer = get_scorer('accuracy') + + +def wrapped_weighted_acc(est, X, y, sample_weight=None): + 
return acc_scorer(est, X, y, sample_weight=MY_WEIGHTS.loc[X.index]) + + +lr = WeightedLogisticRegressionCV( + cv=wrapped_group_cv, + scoring=wrapped_weighted_acc, +).set_props_request(['sample_weight']) +cross_validate(lr, X, y, + cv=wrapped_group_cv, + scoring=wrapped_weighted_acc) + +# %% +# Case B: weighted scoring and unweighted fitting + +lr = LogisticRegressionCV( + cv=wrapped_group_cv, + scoring=wrapped_weighted_acc, +).set_props_request(['sample_weight']) +cross_validate(lr, X, y, + cv=wrapped_group_cv, + scoring=wrapped_weighted_acc) + + +# %% +# Case C: unweighted feature selection + +lr = WeightedLogisticRegressionCV( + cv=wrapped_group_cv, + scoring=wrapped_weighted_acc, +).set_props_request(['sample_weight']) +sel = SelectKBest() +pipe = make_pipeline(sel, lr) +cross_validate(pipe, X, y, + cv=wrapped_group_cv, + scoring=wrapped_weighted_acc) + +# %% +# Case D: different scoring and fitting weights + + +def other_weighted_acc(est, X, y, sample_weight=None): + return acc_scorer(est, X, y, sample_weight=MY_OTHER_WEIGHTS.loc[X.index]) + + +lr = WeightedLogisticRegressionCV( + cv=wrapped_group_cv, + scoring=other_weighted_acc, +).set_props_request(['sample_weight']) +sel = SelectKBest() +pipe = make_pipeline(sel, lr) +cross_validate(pipe, X, y, + cv=wrapped_group_cv, + scoring=other_weighted_acc) diff --git a/slep006/cases_opt4b.py b/slep006/cases_opt4b.py new file mode 100644 index 0000000..58aeaa4 --- /dev/null +++ b/slep006/cases_opt4b.py @@ -0,0 +1,78 @@ +from defs import (accuracy, group_cv, make_scorer, SelectKBest, + LogisticRegressionCV, cross_validate, + make_pipeline, X, y, my_groups, my_weights, + my_other_weights) + +# %% +# Case A: weighted scoring and fitting + +# Here we presume that GroupKFold requests `groups` by default. +# We need to explicitly request weights in make_scorer and for +# LogisticRegressionCV. Both of these consumers understand the meaning +# of the key "sample_weight". 
+ +weighted_acc = make_scorer(accuracy, request_props=['sample_weight']) +lr = LogisticRegressionCV( + cv=group_cv, + scoring=weighted_acc, +).request_sample_weight(fit=['sample_weight']) +cross_validate(lr, X, y, cv=group_cv, + props={'sample_weight': my_weights, 'groups': my_groups}, + scoring=weighted_acc) + +# Error handling: if props={'sample_eight': my_weights, ...} was passed, +# cross_validate would raise an error, since 'sample_eight' was not requested +# by any of its children. + +# %% +# Case B: weighted scoring and unweighted fitting + +# Since LogisticRegressionCV requires that weights explicitly be requested, +# removing that request means the fitting is unweighted. + +weighted_acc = make_scorer(accuracy, request_props=['sample_weight']) +lr = LogisticRegressionCV( + cv=group_cv, + scoring=weighted_acc, +) +cross_validate(lr, X, y, cv=group_cv, + props={'sample_weight': my_weights, 'groups': my_groups}, + scoring=weighted_acc) + +# %% +# Case C: unweighted feature selection + +# Like LogisticRegressionCV, SelectKBest needs to request weights explicitly. +# Here it does not request them. + +weighted_acc = make_scorer(accuracy, request_props=['sample_weight']) +lr = LogisticRegressionCV( + cv=group_cv, + scoring=weighted_acc, +).request_sample_weight(fit=['sample_weight']) +sel = SelectKBest() +pipe = make_pipeline(sel, lr) +cross_validate(pipe, X, y, cv=group_cv, + props={'sample_weight': my_weights, 'groups': my_groups}, + scoring=weighted_acc) + +# %% +# Case D: different scoring and fitting weights + +# Despite make_scorer and LogisticRegressionCV both expecting a key +# sample_weight, we can use aliases to pass different weights to different +# consumers. 
+ +weighted_acc = make_scorer(accuracy, + request_props={'scoring_weight': 'sample_weight'}) +lr = LogisticRegressionCV( + cv=group_cv, + scoring=weighted_acc, +).request_sample_weight(fit='fitting_weight') +cross_validate(lr, X, y, cv=group_cv, + props={ + 'scoring_weight': my_weights, + 'fitting_weight': my_other_weights, + 'groups': my_groups, + }, + scoring=weighted_acc) diff --git a/slep006/proposal.rst b/slep006/proposal.rst index c336e4a..1612d27 100644 --- a/slep006/proposal.rst +++ b/slep006/proposal.rst @@ -79,7 +79,7 @@ Other related issues include: :issue:`1574`, :issue:`2630`, :issue:`3524`, :issue:`4632`, :issue:`4652`, :issue:`4660`, :issue:`4696`, :issue:`6322`, :issue:`7112`, :issue:`7646`, :issue:`7723`, :issue:`8127`, :issue:`8158`, :issue:`8710`, :issue:`8950`, :issue:`11429`, :issue:`12052`, :issue:`15282`, -:issues:`15370`, :issue:`15425`. +:issue:`15370`, :issue:`15425`, :issue:`18028`. Desiderata ---------- @@ -368,6 +368,14 @@ Disadvantages: `set_props_request` method (instead of the `request_props` constructor parameter approach) such that all legacy base estimators are automatically equipped. +* Aliasing is a bit confusing in this design, in that the consumer still + accepts the fit param by its original name (e.g. `sample_weight`) even if it + has a request that specifies a different key given to the router (e.g. + `fit_sample_weight`). This design has the advantage that the handling of + props within a consumer is simple and unchanged; the complexity is in + how it is forwarded the data by the router, but it may be conceptually + difficult for users to understand. (This may be acceptable, as an advanced + feature.) * For estimators to be cloned, this request information needs to be cloned with it. This implies one of: the request information be stored as a constructor paramerter; or `clone` is extended to explicitly copy request information. @@ -389,6 +397,22 @@ Test cases: .. 
literalinclude:: cases_opt4.py +Extensions and alternatives to the syntax considered while working on +:pr:`16079`: + +* `set_props_request` and `get_props_request` have lists of props requested + **for each method** i.e. fit, score, transform, predict and perhaps others. +* `set_props_request` could be replaced by a method (or parameter) representing + the routing of each prop that it consumes. For example, an estimator that + consumes `sample_weight` would have a `request_sample_weight` method. One of + the difficulties of this approach is automatically introducing + `request_sample_weight` into classes inheriting from BaseEstimator without + too much magic (e.g. meta-classes, which might be the simplest solution). + +These are demonstrated together in the following: + +.. literalinclude:: cases_opt4b.py + Naming ------ @@ -405,30 +429,49 @@ Proposal Having considered the above solutions, we propose: -TODO +* Solution 4 per :pr:`16079` which will be used to resolve further, specific + details of the solution. +* Props will be known simply as Metadata. +* `**kw` syntax will be used to pass props by key. -* which solution? -* if an estimator requests a prop, must it be not-null? Must it be provided or explicitly passed as None? -* props param or kwargs? -* naming? +TODO: + +* if an estimator requests a prop, must it be not-null? Must it be provided or + explicitly passed as None? Backward compatibility ---------------------- -TODO +Under this proposal, consumer behaviour will be backwards compatible, but +meta-estimators will change their routing behaviour. + +By default, `sample_weight` will not be requested by estimators that support +it. This ensures that addition of `sample_weight` support to an estimator will +not change its behaviour. -TODO: Do we continue to handle sample_weight such that it only gets provided of requested explicitly? Or do we make it requested by default in the future (possibly with a deprecation period)? 
+During a deprecation period, fit_params will be handled dually: Keys that are +requested will be passed through the new request mechanism, while keys that are +not known will be routed using legacy mechanisms. At completion of the +deprecation period, the legacy handling will cease. -During a deprecation period, fit_params will be handled dually: Keys that are requested will be passed through the new request mechanism, while keys that are not known will be routed using legacy mechanisms. At completion of the deprecation period, the legacy handling will cease. +Similarly, during a deprecation period, `fit_params` in GridSearchCV and +related utilities will be routed to the estimator's `fit` by default, per +incumbent behaviour. After the deprecation period, an error will be raised for +any params not explicitly requested. -Grouped cross validation splitters will request `groups` since they were previously unusable in a nested cross validation context, so this should not often create backwards incompatibilities, except perhaps where a fit param named `groups` served another purpose. +Grouped cross validation splitters will request `groups` since they were +previously unusable in a nested cross validation context, so this should not +often create backwards incompatibilities, except perhaps where a fit param +named `groups` served another purpose. Discussion ---------- -One benefit of the explicitness in Solution 4 is that even if it makes use of **kw arguments, it does not preclude keywords arguments serving other purposes in addition. That is, in addition to requesting sample props, a future proposal could allow estimators to request feature metadata or other keys. - -TODO +One benefit of the explicitness in Solution 4 is that even if it makes use of +`**kw` arguments, it does not preclude keyword arguments serving other +purposes in addition. 
That is, in addition to requesting sample props, a +future proposal could allow estimators to request feature metadata or other +keys. References and Footnotes ------------------------