It was a challenging task, but I believe I have found a suitable solution :) Although I couldn't extract the significance values, I managed to examine the values used to construct the graph. In essence, you need to modify the source code of the plot.STM function and save the values into a dataframe. Below I provide an example of how I accomplished this for 25 topics. In each iteration of the loop, the words and their corresponding values are saved in a dataframe.
funktion <- function (x, type = c("summary", "labels", "perspectives", "hist"),
n = NULL, topics = NULL, labeltype = c("prob", "frex", "lift",
"score"), frexw = 0.5, main = NULL, xlim = NULL, ylim = NULL,
xlab = NULL, family = "", width = 80, covarlevels = NULL,
plabels = NULL, text.cex = 1, custom.labels = NULL, topic.names = NULL,
...)
{
model <- x
content_df <- data.frame(column_name = rep(1, 24))
print(content_df)
type <- match.arg(type)
contentcov <- length(model$beta$logbeta) != 1
if (contentcov & !missing(labeltype))
stop("Cannot specify label type for content covariate models.")
labeltype <- match.arg(labeltype)
if (!is.null(custom.labels))
labeltype <- "custom"
if (is.null(n))
n <- switch(type, summary = 3, labels = 20, perspectives = 25,
hist = 3)
if (type != "perspectives" & is.null(topics))
topics <- 1:model$settings$dim$K
if (labeltype != "custom") {
if (type != "perspectives") {
lab <- labelTopics(model, topics = topics, n = n,
frexweight = frexw)
if (contentcov) {
lab <- lab$topics
}
else {
lab <- lab[[labeltype]]
}
}
}
else {
lab <- custom.labels
if (length(lab) != length(topics))
lab <- rep_len(lab, length.out = length(topics))
}
if (!is.null(topic.names))
topic.names <- rep_len(topic.names, length.out = length(topics))
if (type == "perspectives") {
if (!contentcov)
covarlevels <- c(1, 1)
if (is.null(topics))
stop("Must specify one or two topic numbers using topics")
if (length(topics) > 2)
stop("Too many topics specified.")
if (length(topics) == 1) {
topics <- rep(topics, 2)
}
sametopics <- topics[1] == topics[2]
if (is.null(covarlevels)) {
covarlevels <- c(1, 2)
}
else {
if (length(covarlevels) > 2)
stop("More than two covariate levels specified.")
if (length(covarlevels) == 1)
covarlevels <- rep(covarlevels, 2)
covlabels <- model$seetings$covariates$yvarlevels
if (is.character(covarlevels)) {
covarlevels <- pmatch(covarlevels, model$settings$covariates$yvarlevels)
if (any(is.na(covarlevels)))
stop("Unrecognized covariate levels")
}
}
samecovars <- covarlevels[1] == covarlevels[2]
if (is.null(plabels)) {
plabels <- vector(length = 2)
if (!contentcov) {
plabels[1] <- paste0(c("Topic ", topics[1]),
collapse = "")
plabels[2] <- paste0(c("Topic ", topics[2]),
collapse = "")
}
else {
if (sametopics & !samecovars) {
plabels[1] <- paste0(c(model$settings$covariates$yvarlevels[covarlevels[1]],
"\n", "(Topic ", topics[1], ")"), collapse = "")
plabels[2] <- paste0(c(model$settings$covariates$yvarlevels[covarlevels[2]],
"\n", "(Topic ", topics[2], ")"), collapse = "")
}
else {
plabels[1] <- paste0(c("Topic ", topics[1],
") \n", "(", model$settings$covariates$yvarlevels[covarlevels[1]],
")"), collapse = "")
plabels[2] <- paste0(c("Topic ", topics[2],
") \n", "(", model$settings$covariates$yvarlevels[covarlevels[2]],
")"), collapse = "")
}
}
}
left <- model$beta$logbeta[[covarlevels[1]]][topics[1],
]
right <- model$beta$logbeta[[covarlevels[2]]][topics[2],
]
nd2 <- floor(n/2)
words <- (c(order(left, decreasing = TRUE)[1:nd2],
order(right, decreasing = TRUE)[1:nd2]))
print(words)
content_df$id <- words
scale <- pmax(exp(left[words]) + exp(right[words]))
scale <- asinh((scale/max(scale)) * 4)
diff <- exp(left[words]) - exp(right[words])
diff <- diff/max(abs(diff))
print(diff)
content_df$diff <- diff
plot(c(0, 0), xlim = c(max(diff) + 0.1, min(diff) -
0.1), ylim = c(-2 * length(diff), 6 * length(diff)),
type = "n", xaxt = "n", xlab = "", main = main,
yaxt = "n", ylab = "", bty = "n", ...)
segments(0, 0, 0, -2 * length(diff), lty = 2)
rand <- sample(seq(1, 6 * length(diff), by = 2), length(diff),
replace = F)
thresh <- 0.1
negdiff <- diff * (diff < -thresh)
posdiff <- diff * (diff > thresh)
middiff <- diff * (diff < thresh & diff > -thresh)
colors <- grDevices::rgb(posdiff + 0.75, 0.75, -negdiff +
0.75, maxColorValue = 2)
text(diff, rand, model$vocab[words], cex = text.cex *
scale, col = colors, family = family)
text(0.75 * min(diff), -length(diff), as.character(plabels[2]),
col = grDevices::rgb(0.5, 0.5, 1.6, 2, maxColorValue = 2),
cex = 2, pos = 1)
text(0.75 * max(diff), -length(diff), as.character(plabels[1]),
col = grDevices::rgb(1.6, 0.5, 0.5, 2, maxColorValue = 2),
cex = 2, pos = 1)
segments(min(diff), -0.5 * length(diff), max(diff),
-0.5 * length(diff))
print(model$vocab[words])
content_df$word <- model$vocab[words]
content_df$topic <- i
return(content_df)
}
}
result_df <- data.frame()
for (i in 1:25) {
content_df <- funktion(stmContent2, type = "perspective",
topics = i,
covariate = "lang",
text.cex = .9,
main = paste("Thema", i, "(De)"))
result_df <- rbind(result_df, content_df)
}