
I want to perform sentiment classification on a German dataset. I am using the following code, which works fine with English text but raises an error with German text.

Here is my code:

#loading required libraries
library(tm)
library(readxl)
library(data.table)
library(plyr)
library(dplyr)
library(zoo)
library(ggplot2)
library(ranger)
library(e1071)

df<- data.table(read_excel("data/German2datasets.xlsx", skip = 1))

# An abstract function to preprocess a text column
preprocess <- function(text_column)
{
  # Use tm to get a doc matrix
  corpus <- Corpus(VectorSource(text_column))
  # all lower case
  corpus <- tm_map(corpus, content_transformer(tolower))
  # remove punctuation
  corpus <- tm_map(corpus, content_transformer(removePunctuation))
  # remove numbers
  corpus <- tm_map(corpus, content_transformer(removeNumbers))
  # remove stopwords
  corpus <- tm_map(corpus, removeWords, stopwords("german"))
  # stem document
  corpus <- tm_map(corpus, stemDocument)
  # strip white spaces (always at the end)
  corpus <- tm_map(corpus, stripWhitespace)
  # return
  corpus    
}

# Get preprocess training and test data
corpus <- preprocess(df$TEXT)


# Create a Document Term Matrix for train and test
# Just including bi and tri-grams

Sys.setenv(JAVA_HOME='D://Program Files/Java/jre1.8.0_112') # for 32-bit version
library(rJava)
library(RWeka)

# Bi-Trigram tokenizer function (you can always get longer n-grams)
bitrigramtokeniser <- function(x, n) {
  RWeka:::NGramTokenizer(x, RWeka:::Weka_control(min = 2, max = 3))
}


"
Remove remove words <=2
TdIdf weighting
Infrequent (< than 1% of documents) and very frequent (> 80% of documents) terms not included
"

dtm <- DocumentTermMatrix(corpus, control=list(wordLengths=c(2, Inf), 
                                               tokenize = bitrigramtokeniser, 
                                               weighting = function(x) weightTfIdf(x, normalize = FALSE),
                                               bounds=list(global=c(floor(length(corpus)*0.01), floor(length(corpus)*.8)))))


sent <- df$Sentiment

# Variable selection
# ~~~~~~~~~~~~~~~~~~~~
"
For dimension reduction.
The function calculates chi-square value for each phrase and keeps phrases with highest chi_square values
Ideally you want to put variable selection as part of cross-validation.

chisqTwo function takes:
document term matrix (dtm), 
vector of labels (labels), and 
number of n-grams you want to keep (n_out)

"
chisqTwo <- function(dtm, labels, n_out=2000){
  mat       <- as.matrix(dtm)
  cat1      <-  colSums(mat[labels==T,])        # total number of times phrase used in cat1 
  cat2      <-  colSums(mat[labels==F,])        # total number of times phrase used in cat2 
  n_cat1        <-  sum(mat[labels==T,]) - cat1     # total phrases in cat1 documents excluding this phrase
  n_cat2        <-  sum(mat[labels==F,]) - cat2     # total phrases in cat2 documents excluding this phrase

  num       <- (cat1*n_cat2 - cat2*n_cat1)^2
  den       <- (cat1 + cat2)*(cat1 + n_cat1)*(cat2 + n_cat2)*(n_cat1 + n_cat2)
  chisq         <- num/den

  chi_order <- chisq[order(chisq, decreasing = TRUE)][1:n_out]   # keep phrases with the highest chi-square values
  mat       <- mat[, colnames(mat) %in% names(chi_order)]
  mat
}


n <- nrow(dtm)
shuffled <- dtm[sample(n),]
train_dtm <- shuffled[1:round(0.7 * n),]
test_dtm <- shuffled[(round(0.7 * n) + 1):n,]


"
With high dimensional data, test matrix may not have all the phrases training matrix has.
This function fixes that - so that test matrix has the same columns as training.
testmat takes column names of training matrix (train_mat_cols), and 
test matrix (test_mat)
and outputs test_matrix with the same columns as training matrix
"
# Test matrix maker
testmat <- function(train_mat_cols, test_mat){  
  # train_mat_cols <- colnames(train_mat); test_mat <- as.matrix(test_dtm)
  test_mat  <- test_mat[, colnames(test_mat) %in% train_mat_cols]

  miss_names    <- train_mat_cols[!(train_mat_cols %in% colnames(test_mat))]
  if(length(miss_names)!=0){
    colClasses  <- rep("numeric", length(miss_names))
    df          <- read.table(text = '', colClasses = colClasses, col.names = miss_names)
    df[1:nrow(test_mat),] <- 0
    test_mat    <- cbind(test_mat, df)
  }
  as.matrix(test_mat)
}

# Train and test matrices
train_mat <- chisqTwo(train_dtm, train$Sentiment)
test_mat  <- testmat(colnames(train_mat), as.matrix(test_dtm))

dim(train_mat)
dim(test_mat)


n <- nrow(df)
shuffled <- df[sample(n),]
train_data <- shuffled[1:round(0.7 * n),]
test_data <- shuffled[(round(0.7 * n) + 1):n,]

train_mat <- as.data.frame(as.matrix(train_mat))
colnames(train_mat) <- make.names(colnames(train_mat))
train_mat$Sentiment <- train_data$Sentiment

test_mat <- as.data.frame(as.matrix(test_mat))
colnames(test_mat) <- make.names(colnames(test_mat))
test_mat$Sentiment <- test_data$Sentiment

train_mat$Sentiment <- as.factor(train_mat$Sentiment)
test_mat$Sentiment <- as.factor(test_mat$Sentiment)

Next, I will apply caret ML algorithms to the train and test sets created above to predict the Sentiment.
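
For completeness, a minimal sketch of that caret step (not part of the original code; the model, resampling scheme, and lack of tuning are illustrative assumptions):

library(caret)

# 5-fold cross-validation; "ranger" (random forest) is only an example model
ctrl <- trainControl(method = "cv", number = 5)
fit  <- train(Sentiment ~ ., data = train_mat, method = "ranger", trControl = ctrl)

# predict on the held-out matrix and inspect accuracy
pred <- predict(fit, newdata = test_mat)
confusionMatrix(pred, test_mat$Sentiment)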

I am getting the following error in the preprocess() function:

> corpus <- preprocess(df$TEXT)
 Error in FUN(content(x), ...) : 
  invalid input 'Ich bin seit Jahren zufrieden mit der Basler Versicherubg🌺' in 'utf8towcs' 

Data - https://drive.google.com/open?id=1T_LpL2G8upztihAC2SQeVs4YCPH-yfOs

Shivam
  • first check to see if the German text is being read in properly. That might be the issue. As for nlp work for non-English languages you might want to look into `udpipe`. – phiver Jan 29 '18 at 18:42
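
A quick way to follow that advice (a sketch, not from the original thread) is to inspect the declared encodings and flag strings that are not valid UTF-8 right after reading the file:

# declared encodings of the first few texts, and positions of strings that are not valid UTF-8
Encoding(head(df$TEXT))
which(!validUTF8(df$TEXT))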

2 Answers


How about trying a different package to get to the pre-Weka etc stages? This is equivalent (and simpler imho):

library("quanteda")
library("readtext")

# reads in the spreadsheet and creates the corpus
germancorp <- 
    readtext("data/German2datasets.xlsx", text_field = "TEXT") %>%
    corpus()

# does all of the steps of your preprocess() function
dtm <- dfm(germancorp, ngrams = c(2, 3),
           tolower = TRUE,
           remove_punct = TRUE,
           remove_numbers = TRUE,
           remove = stopwords("german"),
           stem = TRUE)

# remove words with only a single count
dtm <- dfm_trim(dtm, min_count = 2)

# form tf-idf weights - change the base argument from default 10 if you wish
dtm <- dfm_tfidf(dtm)

# if you really want a tm formatted DocumentTermMatrix
convert(dtm, to = "tm")

The quanteda package can do some of what you list as additional steps, although it is not clear exactly what you are doing. (Your question focused on the preprocess() failure so I answered that.)
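
For example, the document-frequency bounds from the question (keep terms appearing in at least 1% and at most 80% of documents) could be approximated with dfm_trim, applied before the tf-idf weighting; a sketch, assuming the same quanteda version as above:

# keep only terms occurring in >= 1% and <= 80% of documents (apply before dfm_tfidf)
dtm <- dfm_trim(dtm, min_docfreq = floor(0.01 * ndoc(dtm)),
                max_docfreq = floor(0.80 * ndoc(dtm)))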

Ken Benoit

If you haven't found the reason yet: invalid input in 'utf8towcs'.

It is the encoding of the file (depending on your [virtual] environment and the current system options, and of course on the encoding used when the file was saved to disk at the time of creation).

A workaround looks like this:

library(stringr)
usableText <- str_replace_all(tweets$text, "[^[:graph:]]", " ")

or

toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
your_corpus <- tm_map(your_corpus, toSpace, "[^[:graph:]]")
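
A related approach (a sketch, not part of the original answer; whether it helps depends on the file's actual encoding) is to convert the text column to valid UTF-8 before building the corpus, dropping bytes that cannot be converted:

# from = "" uses the native encoding; sub = "" drops unconvertible bytes
df$TEXT <- iconv(df$TEXT, from = "", to = "UTF-8", sub = "")
corpus  <- preprocess(df$TEXT)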
InLaw
  • 2,537
  • 2
  • 21
  • 33