Skip to content

Commit bd2fdae

Browse files
committed
Pushing the docs to dev/ for branch: master, commit 0d8a04bd17e3393df271845e663698e2f48354bf
1 parent 25ef25d commit bd2fdae

File tree

1,078 files changed

+3458
-3352
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

1,078 files changed

+3458
-3352
lines changed
-3 Bytes
Binary file not shown.
-4 Bytes
Binary file not shown.

dev/_downloads/column_transformer_mixed_types.ipynb renamed to dev/_downloads/plot_column_transformer_mixed_types.ipynb

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -26,7 +26,7 @@
2626
},
2727
"outputs": [],
2828
"source": [
29-
"# Author: Pedro Morales <[email protected]>\n#\n# License: BSD 3 clause\n\nfrom __future__ import print_function\n\nimport pandas as pd\n\nfrom sklearn.compose import make_column_transformer\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.impute import SimpleImputer\nfrom sklearn.preprocessing import StandardScaler, CategoricalEncoder\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split, GridSearchCV\n\n\n# Read data from Titanic dataset.\ntitanic_url = ('https://raw.githubusercontent.com/amueller/'\n 'scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv')\ndata = pd.read_csv(titanic_url)\n\n# We will train our classifier with the following features:\n# Numeric Features:\n# - age: float.\n# - fare: float.\n# Categorical Features:\n# - embarked: categories encoded as strings {'C', 'S', 'Q'}.\n# - sex: categories encoded as strings {'female', 'male'}.\n# - pclass: ordinal integers {1, 2, 3}.\nnumeric_features = ['age', 'fare']\ncategorical_features = ['embarked', 'sex', 'pclass']\n\n# Provisionally, use pd.fillna() to impute missing values for categorical\n# features; SimpleImputer will eventually support strategy=\"constant\".\ndata[categorical_features] = data[categorical_features].fillna(value='missing')\n\n# We create the preprocessing pipelines for both numeric and categorical data.\nnumeric_transformer = make_pipeline(SimpleImputer(), StandardScaler())\ncategorical_transformer = CategoricalEncoder('onehot-dense',\n handle_unknown='ignore')\n\npreprocessing_pl = make_column_transformer(\n (numeric_features, numeric_transformer),\n (categorical_features, categorical_transformer),\n remainder='drop'\n)\n\n# Append classifier to preprocessing pipeline.\n# Now we have a full prediction pipeline.\nclf = make_pipeline(preprocessing_pl, LogisticRegression())\n\nX = data.drop('survived', axis=1)\ny = data.survived.values\n\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,\n 
shuffle=True)\n\nclf.fit(X_train, y_train)\nprint(\"model score: %f\" % clf.score(X_test, y_test))"
29+
"# Author: Pedro Morales <[email protected]>\n#\n# License: BSD 3 clause\n\nfrom __future__ import print_function\n\nimport pandas as pd\nimport numpy as np\n\nfrom sklearn.compose import ColumnTransformer\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.impute import SimpleImputer\nfrom sklearn.preprocessing import StandardScaler, CategoricalEncoder\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split, GridSearchCV\n\nnp.random.seed(0)\n\n# Read data from Titanic dataset.\ntitanic_url = ('https://raw.githubusercontent.com/amueller/'\n 'scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv')\ndata = pd.read_csv(titanic_url)\n\n# We will train our classifier with the following features:\n# Numeric Features:\n# - age: float.\n# - fare: float.\n# Categorical Features:\n# - embarked: categories encoded as strings {'C', 'S', 'Q'}.\n# - sex: categories encoded as strings {'female', 'male'}.\n# - pclass: ordinal integers {1, 2, 3}.\n\n# We create the preprocessing pipelines for both numeric and categorical data.\nnumeric_features = ['age', 'fare']\nnumeric_transformer = Pipeline(steps=[\n ('imputer', SimpleImputer(strategy='median')),\n ('scaler', StandardScaler())])\n\ncategorical_features = ['embarked', 'sex', 'pclass']\ncategorical_transformer = Pipeline(steps=[\n ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),\n ('onehot', CategoricalEncoder('onehot-dense', handle_unknown='ignore'))])\n\npreprocessor = ColumnTransformer(\n transformers=[\n ('num', numeric_transformer, numeric_features),\n ('cat', categorical_transformer, categorical_features)],\n remainder='drop')\n\n# Append classifier to preprocessing pipeline.\n# Now we have a full prediction pipeline.\nclf = Pipeline(steps=[('preprocessor', preprocessor),\n ('classifier', LogisticRegression())])\n\nX = data.drop('survived', axis=1)\ny = data['survived']\n\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,\n 
shuffle=True)\n\nclf.fit(X_train, y_train)\nprint(\"model score: %.3f\" % clf.score(X_test, y_test))"
3030
]
3131
},
3232
{
@@ -44,7 +44,7 @@
4444
},
4545
"outputs": [],
4646
"source": [
47-
"param_grid = {\n 'columntransformer__pipeline__simpleimputer__strategy': ['mean', 'median'],\n 'logisticregression__C': [0.1, 1.0, 1.0],\n}\n\ngrid_search = GridSearchCV(clf, param_grid, cv=10, iid=False)\ngrid_search.fit(X_train, y_train)\n\nprint((\"best logistic regression from grid search: %f\"\n % grid_search.score(X_test, y_test)))"
47+
"param_grid = {\n 'preprocessor__num__imputer__strategy': ['mean', 'median'],\n 'classifier__C': [0.1, 1.0, 10, 100],\n}\n\ngrid_search = GridSearchCV(clf, param_grid, cv=10, iid=False)\ngrid_search.fit(X_train, y_train)\n\nprint((\"best logistic regression from grid search: %.3f\"\n % grid_search.score(X_test, y_test)))"
4848
]
4949
}
5050
],

dev/_downloads/column_transformer_mixed_types.py renamed to dev/_downloads/plot_column_transformer_mixed_types.py

Lines changed: 25 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -27,14 +27,16 @@
2727
from __future__ import print_function
2828

2929
import pandas as pd
30+
import numpy as np
3031

31-
from sklearn.compose import make_column_transformer
32-
from sklearn.pipeline import make_pipeline
32+
from sklearn.compose import ColumnTransformer
33+
from sklearn.pipeline import Pipeline
3334
from sklearn.impute import SimpleImputer
3435
from sklearn.preprocessing import StandardScaler, CategoricalEncoder
3536
from sklearn.linear_model import LogisticRegression
3637
from sklearn.model_selection import train_test_split, GridSearchCV
3738

39+
np.random.seed(0)
3840

3941
# Read data from Titanic dataset.
4042
titanic_url = ('https://raw.githubusercontent.com/amueller/'
@@ -49,36 +51,37 @@
4951
# - embarked: categories encoded as strings {'C', 'S', 'Q'}.
5052
# - sex: categories encoded as strings {'female', 'male'}.
5153
# - pclass: ordinal integers {1, 2, 3}.
52-
numeric_features = ['age', 'fare']
53-
categorical_features = ['embarked', 'sex', 'pclass']
54-
55-
# Provisionally, use pd.fillna() to impute missing values for categorical
56-
# features; SimpleImputer will eventually support strategy="constant".
57-
data[categorical_features] = data[categorical_features].fillna(value='missing')
5854

5955
# We create the preprocessing pipelines for both numeric and categorical data.
60-
numeric_transformer = make_pipeline(SimpleImputer(), StandardScaler())
61-
categorical_transformer = CategoricalEncoder('onehot-dense',
62-
handle_unknown='ignore')
56+
numeric_features = ['age', 'fare']
57+
numeric_transformer = Pipeline(steps=[
58+
('imputer', SimpleImputer(strategy='median')),
59+
('scaler', StandardScaler())])
60+
61+
categorical_features = ['embarked', 'sex', 'pclass']
62+
categorical_transformer = Pipeline(steps=[
63+
('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
64+
('onehot', CategoricalEncoder('onehot-dense', handle_unknown='ignore'))])
6365

64-
preprocessing_pl = make_column_transformer(
65-
(numeric_features, numeric_transformer),
66-
(categorical_features, categorical_transformer),
67-
remainder='drop'
68-
)
66+
preprocessor = ColumnTransformer(
67+
transformers=[
68+
('num', numeric_transformer, numeric_features),
69+
('cat', categorical_transformer, categorical_features)],
70+
remainder='drop')
6971

7072
# Append classifier to preprocessing pipeline.
7173
# Now we have a full prediction pipeline.
72-
clf = make_pipeline(preprocessing_pl, LogisticRegression())
74+
clf = Pipeline(steps=[('preprocessor', preprocessor),
75+
('classifier', LogisticRegression())])
7376

7477
X = data.drop('survived', axis=1)
75-
y = data.survived.values
78+
y = data['survived']
7679

7780
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
7881
shuffle=True)
7982

8083
clf.fit(X_train, y_train)
81-
print("model score: %f" % clf.score(X_test, y_test))
84+
print("model score: %.3f" % clf.score(X_test, y_test))
8285

8386

8487
###############################################################################
@@ -93,12 +96,12 @@
9396

9497

9598
param_grid = {
96-
'columntransformer__pipeline__simpleimputer__strategy': ['mean', 'median'],
97-
'logisticregression__C': [0.1, 1.0, 1.0],
99+
'preprocessor__num__imputer__strategy': ['mean', 'median'],
100+
'classifier__C': [0.1, 1.0, 10, 100],
98101
}
99102

100103
grid_search = GridSearchCV(clf, param_grid, cv=10, iid=False)
101104
grid_search.fit(X_train, y_train)
102105

103-
print(("best logistic regression from grid search: %f"
106+
print(("best logistic regression from grid search: %.3f"
104107
% grid_search.score(X_test, y_test)))

dev/_downloads/scikit-learn-docs.pdf

-66.1 KB
Binary file not shown.

dev/_images/iris.png

0 Bytes

0 commit comments

Comments (0)