I am using dtw to calculate distances between several series and getting strange results. Notice that in the sample data below the first 9 customers are identical sets (A==B==C, D==E==F, and G==H==I). The remaining rows are only for noise to allow me to make 8 clusters.
I expect that the first sets would be clustered with their identical partners. This happens when I calculate distance on the original data, but when I scale the data before distance/clustering I get different results.
The distances between identical rows in original data is 0.0 (as expected), but with scaled data the distances is not 0.0 (not even close). Any ideas why they are not the same?
library(TSdist)
library(dplyr)
library(tidyr)
mydata = as_data_frame(read.table(textConnection("
cust P1 P2 P3 P4 P5 P6 P7 P8 P9 P10
1 A 1.1 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
2 B 1.1 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
3 C 1.1 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
4 D 0.0 1.0 2.0 1.0 0.0 1.0 2.0 1.0 0.0 1.0
5 E 0.0 1.0 2.0 1.0 0.0 1.0 2.0 1.0 0.0 1.0
6 F 0.0 1.0 2.0 1.0 0.0 1.0 2.0 1.0 0.0 1.0
7 G 2.0 1.5 1.0 0.5 0.0 0.5 1.0 1.5 2.0 1.5
8 H 2.0 1.5 1.0 0.5 0.0 0.5 1.0 1.5 2.0 1.5
9 I 2.0 1.5 1.0 0.5 0.0 0.5 1.0 1.5 2.0 1.5
10 D2 1.0 2.0 1.0 0.0 1.0 2.0 1.0 0.0 1.0 2.0
11 E2 5.0 6.0 5.0 4.0 5.0 6.0 5.0 4.0 5.0 6.0
12 F2 9.0 10.0 9.0 8.0 9.0 10.0 9.0 8.0 9.0 10.0
13 G2 1.5 1.0 0.5 0.0 0.5 1.0 1.5 2.0 1.5 1.0
14 H2 5.5 5.0 4.5 4.0 4.5 5.0 5.5 6.0 5.5 5.0
15 I2 9.5 9.0 8.5 8.0 8.5 9.0 9.5 10.0 9.5 9.0
16 A3 1.0 1.0 0.0 2.0 1.0 1.0 1.0 1.0 1.0 1.0
17 B3 5.0 5.0 5.0 5.0 5.0 3.0 8.0 5.0 5.0 5.0
18 C3 9.0 9.0 9.0 9.0 9.0 5.4 14.4 9.0 9.0 9.0
19 D3 0.0 1.0 2.0 1.0 0.0 1.0 1.0 2.0 0.0 1.0
20 E3 4.0 5.0 5.0 6.0 4.0 5.0 6.0 5.0 4.0 5.0
21 F3 8.0 9.0 10.0 9.0 9.0 9.0 9.0 9.0 8.0 9.0
22 G3 2.0 1.5 1.0 0.5 0.0 0.5 1.0 2.0 1.5 1.5
23 H3 6.0 5.5 5.0 4.5 4.0 5.0 4.5 5.5 6.0 5.5
24 I3 10.0 9.5 9.0 9.0 8.0 8.5 9.0 9.5 10.0 9.5
25 D4 0.0 3.0 6.0 3.0 0.0 3.0 6.0 3.0 0.0 5.0
26 E4 3.0 6.0 9.0 6.0 3.0 6.0 9.0 6.0 3.0 6.0
27 F4 4.0 6.0 10.0 7.0 5.0 6.0 11.0 8.0 5.0 7.0
28 D5 5.0 0.0 3.0 6.0 3.0 0.0 3.0 6.0 3.0 0.0
29 D6 9.0 6.0 3.0 6.0 9.0 6.0 3.0 6.0 9.0 6.0
30 D7 9.0 11.0 5.0 4.0 6.0 10.0 7.0 5.0 6.0 11.0
31 Dw 0.0 0.8 1.4 2.0 1.0 0.0 2.0 0.0 1.0 2.0
32 Ew 4.0 4.8 5.4 6.0 5.0 4.0 6.0 4.0 5.0 6.0
33 Fw 8.0 8.8 9.4 10.0 9.0 8.0 10.0 8.0 9.0 10.0
34 Gw 2.0 1.5 1.0 0.5 0.0 1.0 2.0 1.5 1.3 1.1
35 Hw 6.0 5.5 5.0 4.5 4.0 5.0 6.0 5.5 5.3 5.1
36 Iw 10.0 9.5 9.0 8.5 8.0 9.0 10.0 9.5 9.3 9.1"),
header = TRUE, stringsAsFactors = FALSE))
k=8
# create a scale version of mydata (raw data - mean) / std dev
mydata_long = mydata %>%
mutate (mean = apply(mydata[,2:ncol(mydata)],1,mean,na.rm = T)) %>%
mutate (sd = apply(mydata[,2:(ncol(mydata))],1,sd,na.rm = T))%>%
gather (period,value,-cust,-mean,-sd) %>%
mutate (sc = (value-mean)/sd)
mydata_sc = mydata_long[,-c(2,3,5)] %>%
spread(period,sc)
# dtw
dtw_dist = TSDatabaseDistances(mydata[2:ncol(mydata)], distance = "dtw",lag.max= 2) #distance
dtw_clus = hclust(dtw_dist, method="ward.D2") # Cluster
dtw_res = data.frame(cutree(dtw_clus, k)) # cut dendrogram into 9 clusters
# dtw (w scaled data)
dtw_sc_dist = TSDatabaseDistances(mydata_sc[2:ncol(mydata_sc)], distance = "dtw",lag.max= 2) #distance
dtw_sc_clus = hclust(dtw_sc_dist, method="ward.D2") # Cluster
dtw_sc_res = data.frame(cutree(dtw_sc_clus, k)) # cut dendrogram into 9 clusters
results = cbind (dtw_res,dtw_sc_res)
names(results) = c("dtw", "dtw_scaled")
print(results)
dtw dtw_scaled
1 1 1
2 1 2
3 1 1
4 1 2
5 1 1
6 1 2
7 1 3
8 1 4
9 1 3
10 1 3
11 2 3
12 3 4
13 1 5
14 2 6
15 3 3
16 1 4
17 2 3
18 4 3
19 1 6
20 2 3
21 3 4
22 1 3
23 2 3
24 3 6
25 5 7
26 6 8
27 7 7
28 5 7
29 6 7
30 8 8
31 1 7
32 2 7
33 3 7
34 1 8
35 2 7
36 3 7