Importing the data set


1. Data Preparation

disney1 <-  disney %>%  
  filter(Age < 24) %>%  #filter for young consumer segment: <24 years old 
  mutate("Nationality_4_TEXT" = NULL) #remove respondent.ID(unnecessary), nationality_4_text columns(too many missing values)
head(disney1)
##   Respondent.ID Gender Age Nationality Living HHSize StreamingUsage Device
## 1             1      1  17           2      1      5              5      4
## 2             2      2  17           2      3      1              5      3
## 3             3      1  19           4      2     16              1      1
## 4             4      2  17           3      1      5              2      4
## 5             5      2  18           3      1      4              3      1
## 6             6      1  18           2      1      6              4      1
##   UnaidedbrandrecallDisney WTP Pastime Entertainment Recommendation ZStreamingUsage
## 1                        0  15       4           6.0           3.50         1.50446
## 2                        1  20       5           5.0           4.50         1.50446
## 3                        0  20       2           5.5           4.00        -1.23897
## 4                        0  30       3           5.5           4.50        -0.55311
## 5                        0  15       6           6.5           5.25         0.13275
## 6                        0  25       4           7.0           4.00         0.81860
##   ZEntertainment ZRecommendation   ZPastime
## 1     -1.2385096     -0.37750756  0.1604940
## 2      0.6937862      0.49366374  0.8086429
## 3      0.6937862      0.05807809 -1.1358037
## 4      0.8870158      0.49366374 -0.4876549
## 5     -1.2385096      1.14704221  1.4567917
## 6      1.4667046      0.05807809  0.1604940
#Base
base <- matrix(disney1$WTP, nrow = 94, ncol=1)
basis = disney1 %>%select(WTP)
descriptors = disney1 %>% select(-WTP, -StreamingUsage, -Recommendation, -Pastime, -Entertainment )


Application of k-means clustering to form relevant segments in the data using consumers’ WTP (in €) as the basis for segmentation.

#make this example reproducible
set.seed(1)

#perform k-means clustering with k = 2 clusters
kmbasis <- kmeans(basis, centers = 2, nstart = 25) #means that R will try 25 different random starting assignments and then select the best results
kmbasis
## K-means clustering with 2 clusters of sizes 56, 38
## 
## Cluster means:
##         WTP
## 1  7.874821
## 2 18.894211
## 
## Clustering vector:
##  [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1
## [42] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [83] 1 1 1 1 1 1 1 1 1 1 1 1
## 
## Within cluster sum of squares by cluster:
## [1] 860.0826 975.7349
##  (between_SS / total_SS =  60.0 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
kmbasis$cluster
##  [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1
## [42] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [83] 1 1 1 1 1 1 1 1 1 1 1 1
kmeans <- as.factor(kmbasis$cluster)
kmeans
##  [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1
## [42] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [83] 1 1 1 1 1 1 1 1 1 1 1 1
## Levels: 1 2

Results:


2. Validating Cluster Partitions

#Mean for WTP cluster 1:
cluster1 <- basis %>% filter(kmeans == 1)
mean(cluster1$WTP) #mean = 7.874821
## [1] 7.874821
#Mean for WTP cluster 2:
cluster2 <- basis %>% filter(kmeans == 2)
mean(cluster2$WTP) #mean = 18.89421
## [1] 18.89421
#T-test for differences in mean (WTP):

#Creation of T-test function
ttest <- function(training, testing) {
  x1_mean <- mean(testing)
  x2_mean <- mean(training)
  s1 <- sd(testing)
  s2 <- sd(training)
  n1 <- length(testing)
  n2 <- length(training)
  dfs <- min(n1-1, n2-1)
  tdata <- (x1_mean - x2_mean) / sqrt((s1^2/n1)+(s2^2/n2))
  tdata
  pvalue <- 2*pt(abs(tdata), df = dfs, lower.tail = FALSE)
  return(pvalue)
}

ttest(training = cluster1$WTP, testing = cluster2$WTP) #t-test result: p-value of 2.066837e-13
## [1] 2.066837e-13

Results:


3. Analysis

Analysis for significant differences in the demographic variables (i.e., gender, nationality, household size) and the device they predominately use for online streaming between the two clusters.

#Since all the demographic variables are numerical, the t-test will be used.
#List of demographic variables: gender, age, nationality, living, HHsize

cluster1d = descriptors[which(kmbasis$cluster == 1),] #split descriptors into 2 clusters
cluster2d = descriptors[which(kmbasis$cluster == 2),]

#Hypothesis test for Gender:
x1 <- sum(cluster1d$Gender == 1)
x2 <- sum(cluster2d$Gender == 1)
n1 <- length(cluster1d$Gender)
n2 <- length(cluster2d$Gender)
p1 <- x1 / n1
p2 <- x2 / n2
ppooled <- (x1+x2) / (n1+n2)
zdata <- (p1-p2) / sqrt(ppooled*(1-ppooled)*((1/n1)+(1/n2)))
pvalue <- 2*pnorm(abs(zdata), lower.tail = FALSE)
pvalue
## [1] 0.1499454
#Hypothesis test for Age:
ttest(training = cluster1d$Age, testing = cluster2d$Age)
## [1] 1.835906e-05
#Hypothesis test for Nationality:
freq1_1 <- sum(cluster1d$Nationality == 1)
freq1_2 <- sum(cluster1d$Nationality == 2)
freq1_3 <- sum(cluster1d$Nationality == 3)
freq1_4 <- sum(cluster1d$Nationality == 4)

freq2_1 <- sum(cluster2d$Nationality == 1)
freq2_2 <- sum(cluster2d$Nationality == 2)
freq2_3 <- sum(cluster2d$Nationality == 3)
freq2_4 <- sum(cluster2d$Nationality == 4)
freq_table <- as.table(rbind(c(freq1_1, freq1_2, freq1_3, freq1_4),
                             c(freq2_1, freq2_2, freq2_3, freq2_4)))
dimnames(freq_table) <- list(
Data.Set = c("Cluster 1", "Cluster 2"),
Status = c("Dutch", "German", "Belgian", "Other"))
freq_table
##            Status
## Data.Set    Dutch German Belgian Other
##   Cluster 1     4     23       7    22
##   Cluster 2     0     16       8    14
Xsq_data <- chisq.test(freq_table)
Xsq_data$statistic
## X-squared 
##  3.793133
Xsq_data$p.value
## [1] 0.2846858
#Hypothesis test for Living:
ttest(training = cluster1d$Living, testing = cluster2d$Living)
## [1] 0.5991855
#Hypothesis test for HHsize:
ttest(training = cluster1d$HHSize, testing = cluster2d$HHSize)
## [1] 0.04665731
#Since Device is a nominal variable, the Chi-squared test will be used.
freq1_1 <- sum(cluster1d$Device == 1)
freq1_2 <- sum(cluster1d$Device == 2)
freq1_3 <- sum(cluster1d$Device == 3)
freq1_4 <- sum(cluster1d$Device == 4)

freq2_1 <- sum(cluster2d$Device == 1)
freq2_2 <- sum(cluster2d$Device == 2)
freq2_3 <- sum(cluster2d$Device == 3)
freq2_4 <- sum(cluster2d$Device == 4)
freq_table <- as.table(rbind(c(freq1_1, freq1_2, freq1_3, freq1_4),
                             c(freq2_1, freq2_2, freq2_3, freq2_4)))
dimnames(freq_table) <- list(
Data.Set = c("Cluster 1", "Cluster 2"),
Status = c("Laptop", "PC", "Tablet", "Phone"))
freq_table
##            Status
## Data.Set    Laptop PC Tablet Phone
##   Cluster 1     33  4      4    15
##   Cluster 2     14  2     10    12
Xsq_data <- chisq.test(freq_table)

Xsq_data$statistic
## X-squared 
##  8.102578
Xsq_data$p.value
## [1] 0.04393862

Results:


4. Data Visualisation

Interpretation of the Snake Chart:


#Hypothesis tests
ttest(training = cluster1d$ZStreamingUsage, testing = cluster2d$ZStreamingUsage)
## [1] 0.1577568
ttest(training = cluster1d$ZEntertainment, testing = cluster2d$ZEntertainment)
## [1] 0.3048904
ttest(training = cluster1d$ZRecommendation, testing = cluster2d$ZRecommendation)
## [1] 0.9609302
ttest(training = cluster1d$ZPastime, testing = cluster2d$ZPastime)
## [1] 0.8829256


Results: All variables used have p-values that are much larger than the 5% or 10% significance levels. This implies that none of the clusters have statistically significant different means.

5. Conclusion

Meaningful insights for Disney+.

Insights:

Overall, it seems that these youths do not enjoy streaming online videos for entertainment and are also not easily swayed by recommendations offered via 3rd party streaming services. For Disney to appeal to this niche market, it would be wise to conduct more intensive research into youth’s behavioural and psychographic characteristics.


Characterisation:
Cluster 1 - Respondents are generally more pragmatic and cautious about how they consume online streaming services.

Cluster 2 - Repondents are generally more receptive, open-minded, . They could also be more creative, playful youths since one would have to be rather imaginative to enjoy Disney’s productions.


Marketing Suggestions: