In the lab, a classification tree was applied to the Carseats data set after converting Sales into a qualitative response variable. Now we will seek to predict Sales using regression trees and related approaches, treating the response as a quantitative variable.
library(tree)
library(ISLR)
Carseats = na.omit(Carseats)
# Helper: mean squared error of a model's predictions on a data set
calculate_mse = function(model, dataset) {
actual = dataset$Sales
predicted = predict(model, newdata=dataset)
mse = mean((actual-predicted)^2)
return (mse)
}
n = nrow(Carseats)
set.seed(1)
idx = sample(n, n/2)
training = Carseats[idx,]
test = Carseats[-idx,]
tree = tree(Sales ~ ., data = training)
plot(tree, col="grey")
text(tree, pretty=0, col="red")
mse = calculate_mse(tree, test)
mse
[1] 4.148897
predicted = predict(tree, newdata=test)
raw.data = data.frame(predicted, test)
library(knitr)
kable(head(raw.data))
| | predicted | Sales | CompPrice | Income | Advertising | Population | Price | ShelveLoc | Age | Education | Urban | US |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 9.116667 | 9.50 | 138 | 73 | 11 | 276 | 120 | Bad | 42 | 17 | Yes | Yes |
| 2 | 12.082000 | 11.22 | 111 | 48 | 16 | 260 | 83 | Good | 65 | 10 | Yes | Yes |
| 3 | 7.512222 | 10.06 | 113 | 35 | 10 | 269 | 80 | Medium | 59 | 12 | Yes | Yes |
| 5 | 4.627200 | 4.15 | 141 | 64 | 3 | 340 | 128 | Bad | 38 | 13 | Yes | No |
| 7 | 7.130769 | 6.63 | 115 | 105 | 0 | 45 | 108 | Medium | 71 | 15 | Yes | No |
| 8 | 11.014000 | 11.85 | 136 | 81 | 15 | 425 | 120 | Good | 67 | 10 | Yes | Yes |
sd = sqrt(mse)
sd1 = ifelse((raw.data$Sales > raw.data$predicted-sd) & (raw.data$Sales < raw.data$predicted+sd), 1, 0)
sum(sd1) / nrow(raw.data)
[1] 0.68
sd2 = ifelse((raw.data$Sales > raw.data$predicted-sd*2) & (raw.data$Sales < raw.data$predicted+sd*2), 1, 0)
sum(sd2) / nrow(raw.data)
[1] 0.945
cv = cv.tree(tree)
plot(cv$size, cv$dev/nrow(Carseats), type="b", pch=19, col="#562435",
xlab="Tree Size (# Terminal Nodes)", ylab="MSE",
main="Cross Validation: Effect of Tree Size on MSE", ylim=c(2.6,2.8))
pruned.tree = prune.tree(tree, best=9)
pruned.mse = calculate_mse(pruned.tree, test)
pruned.mse
[1] 4.993124
pruned.test.mse = rep(NA, 18)
for (i in 2:18) {
pruned.tree = prune.tree(tree, best=i)
pruned.mse = calculate_mse(pruned.tree, test)
pruned.test.mse[i] = pruned.mse
}
plot(pruned.test.mse, type="b", pch=19, col="#562435",
xlab="Pruned Tree Size (# Terminal Nodes)", ylab="Test MSE",
main="Effect of Pruning on Test MSE", xlim=c(3,18), ylim=c(4,6))
Use the bagging approach in order to analyze this data. What test MSE do you obtain? Use the importance() function to determine which variables are most important.
library(randomForest)
set.seed(3)
# Bagging: use all 10 predictors at each split (mtry = number of predictors)
bagged.tree = randomForest(Sales ~ ., data=training, importance=TRUE, mtry=10)
bagged.mse = calculate_mse(bagged.tree, test)
bagged.mse
[1] 2.55751
We can use the importance() function to identify the key predictor variables:
importance(bagged.tree, type=1)
%IncMSE
CompPrice 14.3725974
Income 5.9039670
Advertising 14.2546456
Population 0.5521805
Price 61.0815445
ShelveLoc 46.5308006
Age 20.9071380
Education 1.3675478
Urban -2.7169457
US 6.8176330
stack(sort(importance(bagged.tree, type=1)[,1], decreasing=TRUE))
values ind
1 61.0815445 Price
2 46.5308006 ShelveLoc
3 20.9071380 Age
4 14.3725974 CompPrice
5 14.2546456 Advertising
6 6.8176330 US
7 5.9039670 Income
8 1.3675478 Education
9 0.5521805 Population
10 -2.7169457 Urban
Use random forests to analyze this data. What test MSE do you obtain? Use the importance() function to determine which variables are most important. Describe the effect of m, the number of variables considered at each split, on the test MSE obtained.
random.forest.mse = rep(0, 10)
for (i in 1:10) {
set.seed(3)
random.forest = randomForest(Sales ~ ., data=training, importance=TRUE, mtry=i)
random.forest.mse[i] = calculate_mse(random.forest, test)
}
plot(random.forest.mse,
type="b",
pch=19,
col="#562435",
xlab="Number of Predictors",
ylab="MSE",
main="Random Forest Results"
,ylim=c(2.5, 3.0)
)
set.seed(3)
random.forest.9 = randomForest(Sales ~ ., data=training, mtry=9, importance=TRUE)
random.forest.9.mse = calculate_mse(random.forest.9, test)
random.forest.9.mse
stack(sort(importance(random.forest.9, type=1)[,1], decreasing=TRUE))
values ind
1 56.8881626 Price
2 43.7826458 ShelveLoc
3 21.1565415 Age
4 14.8210193 Advertising
5 13.9965137 CompPrice
6 5.9415088 US
7 5.2824015 Income
8 1.3512694 Education
9 0.5676259 Population
10 -2.9272455 Urban
This problem involves the OJ data set, which is part of the ISLR package.
library(ISLR)
OJ = na.omit(OJ)
n = nrow(OJ)
set.seed(1)
idx = sample(n, 800)
training = OJ[idx,]
test = OJ[-idx,]
Fit a tree to the training data, with Purchase as the response and the other variables as predictors. Use the summary() function to produce summary statistics about the tree, and describe the results obtained. What is the training error rate? How many terminal nodes does the tree have?
library(tree)
tree = tree(Purchase ~ ., data=training)
summary(tree)
Classification tree:
tree(formula = Purchase ~ ., data = training)
Variables actually used in tree construction:
[1] "LoyalCH" "PriceDiff" "SpecialCH" "ListPriceDiff"
Number of terminal nodes: 8
Residual mean deviance: 0.7305 = 578.6 / 792
Misclassification error rate: 0.165 = 132 / 800
tree
node), split, n, deviance, yval, (yprob)
* denotes terminal node
1) root 800 1064.00 CH ( 0.61750 0.38250 )
2) LoyalCH < 0.508643 350 409.30 MM ( 0.27143 0.72857 )
4) LoyalCH < 0.264232 166 122.10 MM ( 0.12048 0.87952 )
8) LoyalCH < 0.0356415 57 10.07 MM ( 0.01754 0.98246 ) *
9) LoyalCH > 0.0356415 109 100.90 MM ( 0.17431 0.82569 ) *
5) LoyalCH > 0.264232 184 248.80 MM ( 0.40761 0.59239 )
10) PriceDiff < 0.195 83 91.66 MM ( 0.24096 0.75904 )
20) SpecialCH < 0.5 70 60.89 MM ( 0.15714 0.84286 ) *
21) SpecialCH > 0.5 13 16.05 CH ( 0.69231 0.30769 ) *
11) PriceDiff > 0.195 101 139.20 CH ( 0.54455 0.45545 ) *
3) LoyalCH > 0.508643 450 318.10 CH ( 0.88667 0.11333 )
6) LoyalCH < 0.764572 172 188.90 CH ( 0.76163 0.23837 )
12) ListPriceDiff < 0.235 70 95.61 CH ( 0.57143 0.42857 ) *
13) ListPriceDiff > 0.235 102 69.76 CH ( 0.89216 0.10784 ) *
7) LoyalCH > 0.764572 278 86.14 CH ( 0.96403 0.03597 ) *
plot(tree, col="grey")
text(tree, pretty=0, col="red")
actual = test$Purchase
predicted = ifelse(predict(tree, newdata=test)[,1] <= 0.5, "MM", "CH")
confusion.matrix = table(actual, predicted)
confusion.matrix
predicted
actual CH MM
CH 147 12
MM 49 62
test.error.rate = (confusion.matrix[1,2] + confusion.matrix[2,1]) / nrow(test)
test.error.rate
[1] 0.2259259
Apply the cv.tree() function to the training set in order to determine the optimal tree size.
cv = cv.tree(tree, FUN=prune.misclass)
cv
$size
[1] 8 5 2 1
$dev
[1] 156 156 156 306
$k
[1] -Inf 0.000000 4.666667 160.000000
$method
[1] "misclass"
attr(,"class")
[1] "prune" "tree.sequence"
plot(cv$size, (cv$dev / nrow(training) * 100),
type="b",
pch=19,
col="#562435",
xlab="Tree Size (# Terminal Nodes)",
ylab="Error Rate (%)",
main="Cross-Validation Classification Error Rates for Different Sized Trees"
)
pruned.tree = prune.misclass(tree, best=2)
summary(pruned.tree)
Classification tree:
snip.tree(tree = tree, nodes = c(3L, 2L))
Variables actually used in tree construction:
[1] "LoyalCH"
Number of terminal nodes: 2
Residual mean deviance: 0.9115 = 727.4 / 798
Misclassification error rate: 0.1825 = 146 / 800
actual = test$Purchase
predicted = ifelse(predict(pruned.tree, newdata=test)[,1] <= 0.5, "MM", "CH")
confusion.matrix = table(actual, predicted)
confusion.matrix
predicted
actual CH MM
CH 119 40
MM 30 81
pruned.test.error.rate = (confusion.matrix[1,2] + confusion.matrix[2,1]) / nrow(test)
pruned.test.error.rate
[1] 0.2592593
In this problem, you will use support vector approaches in order to predict whether a given car gets high or low gas mileage based on the Auto data set.
library(e1071)
library(ISLR)
Auto = na.omit(Auto)
mpghigh = ifelse(Auto$mpg > median(Auto$mpg), 1, 0)
Auto$mpghigh = as.factor(mpghigh)
Fit a support vector classifier to the data with various values of cost, in order to predict whether a car gets high or low gas mileage. Report the cross-validation errors associated with different values of this parameter. Comment on your results.
set.seed(1)
tune.linear = tune(svm, mpghigh~., data=Auto, kernel="linear", ranges=list(cost=c(0.01, 0.1, 1, 10, 100)))
linear = summary(tune.linear)
linear
Parameter tuning of 'svm':
- sampling method: 10-fold cross validation
- best parameters:
cost
1
- best performance: 0.01275641
- Detailed performance results:
cost error dispersion
1 1e-02 0.07403846 0.05471525
2 1e-01 0.03826923 0.05148114
3 1e+00 0.01275641 0.01344780
4 1e+01 0.02038462 0.01074682
5 1e+02 0.03820513 0.01773427
Now repeat (b), this time using SVMs with radial and polynomial basis kernels, with different values of gamma, degree, and cost. Comment on your results.
set.seed(1)
tune.poly = tune(svm, mpghigh~., data=Auto, kernel="polynomial", ranges=list(cost=c(0.01, 0.1, 1, 10, 100), degree=c(2,3,4)))
poly = summary(tune.poly)
poly
Parameter tuning of 'svm':
- sampling method: 10-fold cross validation
- best parameters:
cost degree
100 2
- best performance: 0.3013462
- Detailed performance results:
cost degree error dispersion
1 1e-02 2 0.5611538 0.04344202
2 1e-01 2 0.5611538 0.04344202
3 1e+00 2 0.5611538 0.04344202
4 1e+01 2 0.5382051 0.05829238
5 1e+02 2 0.3013462 0.09040277
6 1e-02 3 0.5611538 0.04344202
7 1e-01 3 0.5611538 0.04344202
8 1e+00 3 0.5611538 0.04344202
9 1e+01 3 0.5611538 0.04344202
10 1e+02 3 0.3322436 0.11140578
11 1e-02 4 0.5611538 0.04344202
12 1e-01 4 0.5611538 0.04344202
13 1e+00 4 0.5611538 0.04344202
14 1e+01 4 0.5611538 0.04344202
15 1e+02 4 0.5611538 0.04344202
The above results show the lowest cross-validation error occurs at a cost of 100 with a polynomial of degree 2. However, compared to the results in (b), the cross-validation error of this ‘best’ polynomial SVM is approx. 24x that of the linear SVM (30.135% vs. 1.276%).
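As a quick arithmetic check (a small sketch reusing the tune.linear and tune.poly objects created above):
# Ratio of best CV errors: 'best' polynomial SVM vs. linear SVM
tune.poly$best.performance / tune.linear$best.performance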
set.seed(1)
tune.radial = tune(svm, mpghigh~., data=Auto, kernel="radial", ranges=list(cost=c(0.01, 0.1, 1, 10, 100), gamma=c(0.01,0.1,1,10)))
radial = summary(tune.radial)
radial
Parameter tuning of 'svm':
- sampling method: 10-fold cross validation
- best parameters:
cost gamma
100 0.01
- best performance: 0.01532051
- Detailed performance results:
cost gamma error dispersion
1 1e-02 0.01 0.56115385 0.04344202
2 1e-01 0.01 0.09185897 0.03862507
3 1e+00 0.01 0.07147436 0.05103685
4 1e+01 0.01 0.02551282 0.03812986
5 1e+02 0.01 0.01532051 0.01788871
6 1e-02 0.10 0.19153846 0.07612945
7 1e-01 0.10 0.07916667 0.05201159
8 1e+00 0.10 0.05608974 0.05092939
9 1e+01 0.10 0.02551282 0.02076457
10 1e+02 0.10 0.02807692 0.01458261
11 1e-02 1.00 0.56115385 0.04344202
12 1e-01 1.00 0.56115385 0.04344202
13 1e+00 1.00 0.06634615 0.06187383
14 1e+01 1.00 0.06128205 0.06186124
15 1e+02 1.00 0.06128205 0.06186124
16 1e-02 10.00 0.56115385 0.04344202
17 1e-01 10.00 0.56115385 0.04344202
18 1e+00 10.00 0.51775641 0.04471079
19 1e+01 10.00 0.51012821 0.03817175
20 1e+02 10.00 0.51012821 0.03817175
plot(linear$performances[,c("cost","error")], type="l", ylim=c(0,0.6), main="CV results for different SVMs")
lines(poly$performances[poly$performances["degree"]==2, c("cost", "error")], col=2)
lines(poly$performances[poly$performances["degree"]==3, c("cost", "error")], col=3)
lines(poly$performances[poly$performances["degree"]==4, c("cost", "error")], col=4)
lines(radial$performances[radial$performances["gamma"]==0.01, c("cost", "error")], col=5)
lines(radial$performances[radial$performances["gamma"]==0.1 , c("cost", "error")], col=6)
lines(radial$performances[radial$performances["gamma"]==1 , c("cost", "error")], col=7)
lines(radial$performances[radial$performances["gamma"]==10 , c("cost", "error")], col=8)
legend(5, 0.5,
c("Linear", "Polynomial (degree=2)", "Polynomial (degree=3)", "Polynomial (degree=4)", "Radial (gamma=0.01)", "Radial (gamma=0.1)", "Radial (gamma=1)", "Radial (gamma=10)"),
lty=rep("solid", 8),
col=c(1,2,3,4,5,6,7,8),
bty="n"
)
In the lab, we used the plot() function for svm objects only in cases with \(p\) = 2. When \(p\) > 2 you can use the plot() function to create plots displaying pairs of variables at a time. Essentially, instead of typing plot(svmfit, dat), where svmfit contains your fitted model and dat is a data frame containing your data, you can type plot(svmfit, dat, x1 ~ x4) in order to plot just the first and fourth variables. However, you must replace x1 and x4 with the correct variable names. To find out more, type ?plot.svm.
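As an illustration of this syntax, here is a minimal sketch on a small synthetic data set (the data frame toy and its columns x1 to x4 are invented for illustration); the slice argument fixes the values of the variables that are not being plotted:
library(e1071)
set.seed(10)
# Synthetic example: four numeric predictors and a two-class response
toy = data.frame(x1=rnorm(100), x2=rnorm(100), x3=rnorm(100), x4=rnorm(100))
toy$class = as.factor(ifelse(toy$x1 + toy$x4 > 0, "A", "B"))
svmfit = svm(class ~ ., data=toy, kernel="linear", cost=1)
# Decision regions over x1 and x4, holding x2 and x3 fixed at 0
plot(svmfit, toy, x1 ~ x4, slice=list(x2=0, x3=0))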
X1 = c(3,2,4,1,2,4,4)
X2 = c(4,2,4,4,1,3,1)
Y = c("Red", "Red", "Red", "Red", "Blue", "Blue", "Blue")
plot(X1, X2, col=Y, pch=19)
df = data.frame(X1, X2, y=as.factor(Y))
library(e1071)
mmc.fit = svm(y~., data=df, scale=FALSE, kernel="linear", cost=10)
beta = round(t(mmc.fit$coefs) %*% as.matrix(df[mmc.fit$index,1:2]),3)
beta0 = round(mmc.fit$rho, 3)
mmc.intercept = beta0 / beta[2]
mmc.gradient = -beta[1] / beta[2]
plot(X1, X2, col=Y, pch=19)
abline(mmc.intercept, mmc.gradient, lty=5)
legend(1, 4,
c("Maximal Margin Hyperplane"),
lty=c("dashed"),
lwd=c(2),
bty="n"
)
The hyperplane has the equation \(2X_1 - 2X_2 - 1 = 0\), which, rearranging, gives us the equation in \((X_1, X_2)\) space: \(X_2 = X_1 - 0.5\).
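These coefficients can be read directly from the objects computed above (a quick check; the exact values come from the svm() fit):
# Hyperplane coefficients: beta[1]*X1 + beta[2]*X2 - beta0 = 0
beta
beta0
# The same line expressed as X2 = intercept + gradient * X1
c(mmc.intercept, mmc.gradient)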
plot(X1, X2, col=Y, pch=19)
abline(mmc.intercept, mmc.gradient, col="lightgrey")
abline((beta0-1)/beta[2], mmc.gradient, lty=5)
abline((beta0+1)/beta[2], mmc.gradient, lty=5)
legend(1, 4,
c("Maximal Margin Hyperplane", "Margin"),
lty=c("solid", "dashed"),
lwd=c(1, 1),
col=c("lightgrey", "black"),
bty="n"
)
plot(X1, X2, col=Y, pch=19)
abline(mmc.intercept, mmc.gradient, col="lightgrey")
abline((beta0-1)/beta[2], mmc.gradient, lty=5, col="lightgrey")
abline((beta0+1)/beta[2], mmc.gradient, lty=5, col="lightgrey")
points(df[mmc.fit$index,], pch=1, cex=3, col=df[mmc.fit$index,3])
legend(1, 4,
c("Maximal Margin Hyperplane", "Margin", "Support Vector"),
lty=c("solid", "dashed", NA),
lwd=c(1, 1, 1),
pch=c(NA,NA,1),
col=c("lightgrey", "lightgrey", "black"),
bty="n"
)
df[mmc.fit$index,1:2]
X1 X2
2 2 2
3 4 4
6 4 3
plot(X1, X2, col=Y, pch=19)
abline(mmc.intercept, mmc.gradient, col="lightgrey")
abline((beta0-1)/beta[2], mmc.gradient, lty=5, col="lightgrey")
abline((beta0+1)/beta[2], mmc.gradient, lty=5, col="lightgrey")
points(df[mmc.fit$index,], pch=1, cex=3, col="lightgrey")
so.mmc.fit = svm(y~., data=df, scale=FALSE, kernel="linear", cost=1)
so.beta = round(t(so.mmc.fit$coefs) %*% as.matrix(df[so.mmc.fit$index,1:2]),3)
so.beta0 = round(so.mmc.fit$rho, 3)
so.mmc.intercept = round(so.beta0 / so.beta[2], 3)
so.mmc.gradient = round(-so.beta[1] / so.beta[2], 3)
abline(so.mmc.intercept, so.mmc.gradient)
abline((so.beta0-1)/so.beta[2], so.mmc.gradient, lty=5)
abline((so.beta0+1)/so.beta[2], so.mmc.gradient, lty=5)
points(df[so.mmc.fit$index,], pch=1, cex=3, col=df[so.mmc.fit$index,3])
legend(1, 4,
c("Maximal Margin Hyperplane", "Margin", "Support Vector", "Sub-optimal Maximal Margin Hyperplane", "Sub-optimal Margin", "Sub-optimal Support Vector"),
lty=c("solid", "dashed", NA, "solid", "dashed", NA),
lwd=c(1, 1, NA, 1, 1, NA),
pch=c(NA, NA, 1, NA, NA, 1),
col=c("lightgrey", "lightgrey", "lightgrey", "black", "black", "black"),
bty="n"
)
The above hyperplane is sub-optimal because (a) it relies on more support vectors (4 vs. 3 previously), and (b) it has a wider, "soft" margin within which some observations now fall, so it is no longer the maximal margin hyperplane for these data.
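Both points can be checked directly from the two fitted objects (a quick sketch reusing mmc.fit, so.mmc.fit, beta, and so.beta from above; the width of the margin is \(2/\|\beta\|\)):
# Number of support vectors: cost=10 fit vs. cost=1 fit
length(mmc.fit$index)
length(so.mmc.fit$index)
# Margin widths (2 / ||beta||) for the two hyperplanes
2 / sqrt(sum(beta^2))
2 / sqrt(sum(so.beta^2))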
Rearranging the sub-optimal hyperplane's coefficients (so.beta and so.beta0 above) in the same way as before gives its equation in \((X_1, X_2)\) space, with intercept so.mmc.intercept and gradient so.mmc.gradient.
plot(X1, X2, col=Y, pch=19)
points(4, 2, pch=19, col="red")
points(4, 2, pch=1, cex=3, col="red")
Consider the USArrests data. We will now perform hierarchical clustering on the states.
library(ISLR)
set.seed(1)
hc.complete = hclust(dist(USArrests), method="complete")
plot(hc.complete)
hc.cut = cutree(hc.complete, 3)
Cluster #1:
subset(names(hc.cut), hc.cut==1)
[1] "Alabama" "Alaska" "Arizona" "California"
[5] "Delaware" "Florida" "Illinois" "Louisiana"
[9] "Maryland" "Michigan" "Mississippi" "Nevada"
[13] "New Mexico" "New York" "North Carolina" "South Carolina"
Cluster #2:
subset(names(hc.cut), hc.cut==2)
[1] "Arkansas" "Colorado" "Georgia" "Massachusetts"
[5] "Missouri" "New Jersey" "Oklahoma" "Oregon"
[9] "Rhode Island" "Tennessee" "Texas" "Virginia"
[13] "Washington" "Wyoming"
Cluster #3:
subset(names(hc.cut), hc.cut==3)
[1] "Connecticut" "Hawaii" "Idaho" "Indiana"
[5] "Iowa" "Kansas" "Kentucky" "Maine"
[9] "Minnesota" "Montana" "Nebraska" "New Hampshire"
[13] "North Dakota" "Ohio" "Pennsylvania" "South Dakota"
[17] "Utah" "Vermont" "West Virginia" "Wisconsin"
hc.complete.scaled = hclust(dist(scale(USArrests)), method="complete")
plot(hc.complete.scaled)
hc.cut.scaled = cutree(hc.complete.scaled, 3)
Cluster #1:
subset(names(hc.cut.scaled), hc.cut.scaled==1)
[1] "Alabama" "Alaska" "Georgia" "Louisiana"
[5] "Mississippi" "North Carolina" "South Carolina" "Tennessee"
Cluster #2:
subset(names(hc.cut.scaled), hc.cut.scaled==2)
[1] "Arizona" "California" "Colorado" "Florida" "Illinois"
[6] "Maryland" "Michigan" "Nevada" "New Mexico" "New York"
[11] "Texas"
Cluster #3:
subset(names(hc.cut.scaled), hc.cut.scaled==3)
[1] "Arkansas" "Connecticut" "Delaware" "Hawaii"
[5] "Idaho" "Indiana" "Iowa" "Kansas"
[9] "Kentucky" "Maine" "Massachusetts" "Minnesota"
[13] "Missouri" "Montana" "Nebraska" "New Hampshire"
[17] "New Jersey" "North Dakota" "Ohio" "Oklahoma"
[21] "Oregon" "Pennsylvania" "Rhode Island" "South Dakota"
[25] "Utah" "Vermont" "Virginia" "Washington"
[29] "West Virginia" "Wisconsin" "Wyoming"
table(hc.cut.scaled, hc.cut)
hc.cut
hc.cut.scaled 1 2 3
1 6 2 0
2 9 2 0
3 1 10 20
In the USArrests data set, the variables are measured in different units: Murder, Rape, and Assault are measured as the number of occurrences per 100,000 people, while UrbanPop is measured as the percentage of a state's population living in urban (as opposed to rural) areas. The individual variances of these raw variables are:
apply(USArrests, 2, var)
Murder Assault UrbanPop Rape
18.97047 6945.16571 209.51878 87.72916
apply(scale(USArrests), 2, var)
Murder Assault UrbanPop Rape
1 1 1 1
Generate a simulated data set with 20 observations in each of three classes (i.e., 60 observations in total) and 50 variables. One function you can use to generate the data is rnorm(); runif() is another option. Be sure to add a mean shift to the observations in each class so that there are three distinct classes.
set.seed(1)
x1 = matrix(rnorm(20 * 50, mean=0), ncol=50)
set.seed(2)
x2 = matrix(rnorm(20 * 50, mean=1), ncol=50)
set.seed(3)
x3 = matrix(rnorm(20 * 50, mean=2), ncol=50)
x = rbind(x1, x2, x3)
sample.id = c(rep(1, 20), rep(2, 20), rep(3, 20))
plot(x, col=sample.id, pch=20)
plot(prcomp(x)$x[,1:2], pch=20, col=sample.id)
Perform K-Means Clustering of the observations with K = 3, and use the table() function in R to compare the true class labels to the class labels obtained by clustering. Be careful how you interpret the results: K-Means Clustering will arbitrarily number the clusters, so you cannot simply check whether the true class labels and clustering labels are the same.
cluster = kmeans(x, 3, nstart=20)$cluster
table(cluster, sample.id)
sample.id
cluster 1 2 3
1 0 0 20
2 0 20 0
3 20 0 0
cluster = kmeans(x, 2, nstart=20)$cluster
table(cluster, sample.id)
sample.id
cluster 1 2 3
1 20 0 0
2 0 20 20
cluster = kmeans(x, 4, nstart=20)$cluster
table(cluster, sample.id)
sample.id
cluster 1 2 3
1 9 0 0
2 0 20 0
3 11 0 0
4 0 0 20
cluster = kmeans(prcomp(x)$x[,1:2], 3, nstart=20)$cluster
table(cluster, sample.id)
sample.id
cluster 1 2 3
1 20 0 0
2 0 20 0
3 0 0 20
Using the scale() function, perform K-Means Clustering with K = 3 on the data after scaling each variable to have a standard deviation of one. How do these results compare to those obtained in (b)? Explain.
cluster = kmeans(scale(x), 3, nstart=20)$cluster
table(cluster, sample.id)
sample.id
cluster 1 2 3
1 0 20 0
2 0 0 20
3 20 0 0
The scale() function subtracts the mean of each column from each of the values in that column in order to 're-centre' the data, and then divides each resulting value by that column's standard deviation, so that the final data in each column have a standard deviation of 1. The way we constructed the data set in (a) means that, in each column, we have three subsets of data, all with the same standard deviation but with different means. Subtracting the mean of the whole column from these 'subset' means does not change the differences between the subset means. Consequently, there is still sufficient separation between the subsets that the K-Means algorithm clusters the same observations into the same clusters whether or not the data have been scaled.
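A small sketch illustrating this with the simulated data from above: centring shifts all three class means in a column by the same amount, and dividing by the column's standard deviation shrinks the gaps between them by a common factor, so the clusters found with and without scaling agree (up to arbitrary cluster numbering):
# Class means of the first column, before and after scaling
tapply(x[,1], sample.id, mean)
tapply(scale(x)[,1], sample.id, mean)
# Cluster assignments with and without scaling agree up to relabelling
table(kmeans(x, 3, nstart=20)$cluster, kmeans(scale(x), 3, nstart=20)$cluster)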