3

my question relates to structural topic modeling in R, specifically to the stm package developed by Roberts et al. (https://cran.r-project.org/web/packages/stm/vignettes/stmVignette.pdf).

I implemented a structural topic model in order to investigate, whether there is a statistically significant difference in the vocabulary with which women and men describe certain topics. Thus my question relates to the word rates used in discussing a topic, the authors of the vignette refer to this as topical content analysis, see page 19.

The implementation of the code was successful and I manage to create a similar graph to the one shown in Figure 8 of the Vignette.

My question now is, how do I know whether the difference in the vocabulary with which in my case women and men describe topics is statistically significant?

And is there a way to plot this for all of my topics in one graph?

Thank you!

My code:

Estimate the Topic Model

stmContent2 <- stm(out$documents, 
                  out$vocab,
                  K = 80, 
                  prevalence =~ gender,
                  content =~ gender,
                  max.em.its = 75,
                  data = out$meta, 
                  init.type = "Spectral",
                  seed = 8458302)

plot(stmContent2, type = "perspectives", topics = 11)
Marco Sandri
  • 23,289
  • 7
  • 54
  • 58
RAnnR
  • 31
  • 1

1 Answers1

1

It was a challenging task, but I believe I have found a suitable solution :) Although I couldn't extract the significance values, I managed to examine the values used to construct the graph. In essence, you need to modify the source code of the plot.STM function and save the values into a dataframe. Below I provide an example of how I accomplished this for 25 topics. In each iteration of the loop, the words and their corresponding values are saved in a dataframe.

funktion <- function (x, type = c("summary", "labels", "perspectives", "hist"), 
            n = NULL, topics = NULL, labeltype = c("prob", "frex", "lift", 
                                                   "score"), frexw = 0.5, main = NULL, xlim = NULL, ylim = NULL, 
            xlab = NULL, family = "", width = 80, covarlevels = NULL, 
            plabels = NULL, text.cex = 1, custom.labels = NULL, topic.names = NULL, 
            ...) 
  {
    model <- x
    content_df <- data.frame(column_name = rep(1, 24))
    print(content_df)
    type <- match.arg(type)
    contentcov <- length(model$beta$logbeta) != 1
    if (contentcov & !missing(labeltype)) 
      stop("Cannot specify label type for content covariate models.")
    labeltype <- match.arg(labeltype)
    if (!is.null(custom.labels)) 
      labeltype <- "custom"
    if (is.null(n)) 
      n <- switch(type, summary = 3, labels = 20, perspectives = 25, 
                  hist = 3)
    if (type != "perspectives" & is.null(topics)) 
      topics <- 1:model$settings$dim$K
    if (labeltype != "custom") {
      if (type != "perspectives") {
        lab <- labelTopics(model, topics = topics, n = n, 
                           frexweight = frexw)
        if (contentcov) {
          lab <- lab$topics
        }
        else {
          lab <- lab[[labeltype]]
        }
      }
    }
    else {
      lab <- custom.labels
      if (length(lab) != length(topics)) 
        lab <- rep_len(lab, length.out = length(topics))
    }
    if (!is.null(topic.names)) 
      topic.names <- rep_len(topic.names, length.out = length(topics))
    if (type == "perspectives") {
      if (!contentcov) 
        covarlevels <- c(1, 1)
      if (is.null(topics)) 
        stop("Must specify one or two topic numbers using topics")
      if (length(topics) > 2) 
        stop("Too many topics specified.")
      if (length(topics) == 1) {
        topics <- rep(topics, 2)
      }
      sametopics <- topics[1] == topics[2]
      if (is.null(covarlevels)) {
        covarlevels <- c(1, 2)
      }
      else {
        if (length(covarlevels) > 2) 
          stop("More than two covariate levels specified.")
        if (length(covarlevels) == 1) 
          covarlevels <- rep(covarlevels, 2)
        covlabels <- model$seetings$covariates$yvarlevels
        if (is.character(covarlevels)) {
          covarlevels <- pmatch(covarlevels, model$settings$covariates$yvarlevels)
          if (any(is.na(covarlevels))) 
            stop("Unrecognized covariate levels")
        }
      }
      samecovars <- covarlevels[1] == covarlevels[2]
      if (is.null(plabels)) {
        plabels <- vector(length = 2)
        if (!contentcov) {
          plabels[1] <- paste0(c("Topic ", topics[1]), 
                               collapse = "")
          plabels[2] <- paste0(c("Topic ", topics[2]), 
                               collapse = "")
        }
        else {
          if (sametopics & !samecovars) {
            plabels[1] <- paste0(c(model$settings$covariates$yvarlevels[covarlevels[1]], 
                                   "\n", "(Topic ", topics[1], ")"), collapse = "")
            plabels[2] <- paste0(c(model$settings$covariates$yvarlevels[covarlevels[2]], 
                                   "\n", "(Topic ", topics[2], ")"), collapse = "")
          }
          else {
            plabels[1] <- paste0(c("Topic ", topics[1], 
                                   ") \n", "(", model$settings$covariates$yvarlevels[covarlevels[1]], 
                                   ")"), collapse = "")
            plabels[2] <- paste0(c("Topic ", topics[2], 
                                   ") \n", "(", model$settings$covariates$yvarlevels[covarlevels[2]], 
                                   ")"), collapse = "")
          }
        }
      }
      left <- model$beta$logbeta[[covarlevels[1]]][topics[1], 
      ]
      right <- model$beta$logbeta[[covarlevels[2]]][topics[2], 
      ]
      nd2 <- floor(n/2)
      words <- (c(order(left, decreasing = TRUE)[1:nd2], 
                        order(right, decreasing = TRUE)[1:nd2]))
      print(words)
      content_df$id <- words 
      scale <- pmax(exp(left[words]) + exp(right[words]))
      scale <- asinh((scale/max(scale)) * 4)
      diff <- exp(left[words]) - exp(right[words])
      diff <- diff/max(abs(diff))
      print(diff)
      content_df$diff <- diff
      plot(c(0, 0), xlim = c(max(diff) + 0.1, min(diff) - 
                               0.1), ylim = c(-2 * length(diff), 6 * length(diff)), 
           type = "n", xaxt = "n", xlab = "", main = main, 
           yaxt = "n", ylab = "", bty = "n", ...)
      segments(0, 0, 0, -2 * length(diff), lty = 2)
      rand <- sample(seq(1, 6 * length(diff), by = 2), length(diff), 
                     replace = F)
      thresh <- 0.1
      negdiff <- diff * (diff < -thresh)
      posdiff <- diff * (diff > thresh)
      middiff <- diff * (diff < thresh & diff > -thresh)
      colors <- grDevices::rgb(posdiff + 0.75, 0.75, -negdiff + 
                                 0.75, maxColorValue = 2)
      text(diff, rand, model$vocab[words], cex = text.cex * 
             scale, col = colors, family = family)
      text(0.75 * min(diff), -length(diff), as.character(plabels[2]), 
           col = grDevices::rgb(0.5, 0.5, 1.6, 2, maxColorValue = 2), 
           cex = 2, pos = 1)
      text(0.75 * max(diff), -length(diff), as.character(plabels[1]), 
           col = grDevices::rgb(1.6, 0.5, 0.5, 2, maxColorValue = 2), 
           cex = 2, pos = 1)
      segments(min(diff), -0.5 * length(diff), max(diff), 
               -0.5 * length(diff))
      print(model$vocab[words])
      content_df$word <- model$vocab[words]
      content_df$topic <- i
      return(content_df)
    }
}

result_df <- data.frame()

for (i in 1:25) {
  content_df <- funktion(stmContent2, type = "perspective", 
                         topics = i, 
                         covariate = "lang",
                         text.cex = .9,
                         main = paste("Thema", i, "(De)"))
  
  result_df <- rbind(result_df, content_df)
}