@@ -294,17 +294,15 @@ of a house that is 2,000 square feet.
294
294
```
295
295
296
296
``` {code-cell} ipython3
297
- nearest_neighbors = (
298
- small_sacramento.assign(diff=(2000 - small_sacramento["sqft"]).abs())
299
- .nsmallest(5, "diff")
300
- )
301
-
302
- nearest_neighbors
297
+ small_sacramento["dist"] = (2000 - small_sacramento["sqft"]).abs()
298
+ small_sacramento.nsmallest(5, "dist")
303
299
```
304
300
305
301
``` {code-cell} ipython3
306
302
:tags: [remove-cell]
307
303
304
+ nearest_neighbors = small_sacramento.nsmallest(5, "dist")
305
+
308
306
nn_plot = small_plot + rule
309
307
310
308
# plot horizontal lines, which are perpendicular to the line x=2000
@@ -609,16 +607,15 @@ sacr_gridsearch.fit(
609
607
)
610
608
611
609
# Retrieve the CV scores
612
- sacr_results = pd.DataFrame(sacr_gridsearch.cv_results_)[[
613
- "param_kneighborsregressor__n_neighbors",
614
- "mean_test_score",
615
- "std_test_score"
616
- ]]
610
+ sacr_results = pd.DataFrame(sacr_gridsearch.cv_results_)
611
+ sacr_results["sem_test_score"] = sacr_results["std_test_score"] / 5**(1/2)
617
612
sacr_results = (
618
- sacr_results
619
- .assign(sem_test_score=sacr_results["std_test_score"] / 5**(1/2))
613
+ sacr_results[[
614
+ "param_kneighborsregressor__n_neighbors",
615
+ "mean_test_score",
616
+ "sem_test_score"
617
+ ]]
620
618
.rename(columns={"param_kneighborsregressor__n_neighbors": "n_neighbors"})
621
- .drop(columns=["std_test_score"])
622
619
)
623
620
sacr_results
624
621
```
@@ -834,12 +831,10 @@ model uses a different default scoring metric than the RMSPE.
834
831
``` {code-cell} ipython3
835
832
from sklearn.metrics import mean_squared_error
836
833
837
- sacr_preds = sacramento_test.assign(
838
- predicted = sacr_gridsearch.predict(sacramento_test)
839
- )
834
+ sacramento_test["predicted"] = sacr_gridsearch.predict(sacramento_test)
840
835
RMSPE = mean_squared_error(
841
- y_true = sacr_preds ["price"],
842
- y_pred=sacr_preds ["predicted"]
836
+ y_true = sacramento_test ["price"],
837
+ y_pred = sacramento_test ["predicted"]
843
838
)**(1/2)
844
839
RMSPE
845
840
```
@@ -890,9 +885,7 @@ sqft_prediction_grid = pd.DataFrame({
890
885
"sqft": np.arange(sacramento["sqft"].min(), sacramento["sqft"].max(), 10)
891
886
})
892
887
# Predict the price for each of the sqft values in the grid
893
- sacr_preds = sqft_prediction_grid.assign(
894
- predicted = sacr_gridsearch.predict(sqft_prediction_grid)
895
- )
888
+ sqft_prediction_grid["predicted"] = sacr_gridsearch.predict(sqft_prediction_grid)
896
889
897
890
# Plot all the houses
898
891
base_plot = alt.Chart(sacramento).mark_circle(opacity=0.4).encode(
@@ -905,11 +898,14 @@ base_plot = alt.Chart(sacramento).mark_circle(opacity=0.4).encode(
905
898
)
906
899
907
900
# Add the predictions as a line
908
- sacr_preds_plot = base_plot + alt.Chart(sacr_preds, title=f"K = {best_k_sacr}").mark_line(
909
- color="#ff7f0e"
901
+ sacr_preds_plot = base_plot + alt.Chart(
902
+ sqft_prediction_grid,
903
+ title=f"K = {best_k_sacr}"
904
+ ).mark_line(
905
+ color="#ff7f0e"
910
906
).encode(
911
- x="sqft",
912
- y="predicted"
907
+ x="sqft",
908
+ y="predicted"
913
909
)
914
910
915
911
sacr_preds_plot
@@ -1018,25 +1014,24 @@ sacr_gridsearch = GridSearchCV(
1018
1014
cv=5,
1019
1015
scoring="neg_root_mean_squared_error"
1020
1016
)
1017
+
1021
1018
sacr_gridsearch.fit(
1022
1019
sacramento_train[["sqft", "beds"]],
1023
1020
sacramento_train["price"]
1024
1021
)
1025
1022
1026
1023
# retrieve the CV scores
1027
- sacr_results = pd.DataFrame(sacr_gridsearch.cv_results_)[[
1028
- "param_kneighborsregressor__n_neighbors",
1029
- "mean_test_score",
1030
- "std_test_score"
1031
- ]]
1032
-
1024
+ sacr_results = pd.DataFrame(sacr_gridsearch.cv_results_)
1025
+ sacr_results["sem_test_score"] = sacr_results["std_test_score"] / 5**(1/2)
1026
+ sacr_results["mean_test_score"] = -sacr_results["mean_test_score"]
1033
1027
sacr_results = (
1034
- sacr_results
1035
- .assign(sem_test_score=sacr_results["std_test_score"] / 5**(1/2))
1036
- .rename(columns={"param_kneighborsregressor__n_neighbors" : "n_neighbors"})
1037
- .drop(columns=["std_test_score"])
1028
+ sacr_results[[
1029
+ "param_kneighborsregressor__n_neighbors",
1030
+ "mean_test_score",
1031
+ "sem_test_score"
1032
+ ]]
1033
+ .rename(columns={"param_kneighborsregressor__n_neighbors" : "n_neighbors"})
1038
1034
)
1039
- sacr_results["mean_test_score"] = -sacr_results["mean_test_score"]
1040
1035
1041
1036
# show only the row of minimum RMSPE
1042
1037
sacr_results.nsmallest(1, "mean_test_score")
@@ -1069,12 +1064,10 @@ via the `predict` method of the fit `GridSearchCV` object. Finally, we will use
1069
1064
to compute the RMSPE.
1070
1065
1071
1066
``` {code-cell} ipython3
1072
- sacr_preds = sacramento_test.assign(
1073
- predicted = sacr_gridsearch.predict(sacramento_test)
1074
- )
1067
+ sacramento_test["predicted"] = sacr_gridsearch.predict(sacramento_test)
1075
1068
RMSPE_mult = mean_squared_error(
1076
- y_true = sacr_preds ["price"],
1077
- y_pred=sacr_preds ["predicted"]
1069
+ y_true = sacramento_test ["price"],
1070
+ y_pred = sacramento_test ["predicted"]
1078
1071
)**(1/2)
1079
1072
RMSPE_mult
1080
1073
0 commit comments