Here's one approach. Does it work for your data? See further down for a version that uses the OP's data.
# Goal: take the high-frequency terms of a first corpus and look up their
# counts in a second corpus, using term-document matrices (terms are rows,
# documents are columns).

# load text mining library
library(tm)

# make first corpus for text mining (data comes from package, for reproducibility)
data("crude")
corpus1 <- Corpus(VectorSource(crude[1:10]))

# process text (your methods may differ)
# Every element of `funcs` must be a function, because tm_reduce() folds them
# into a single transformation. The stray `MinDocFrequency = 5` entry was a
# number, not a transformation, so it has been removed.
# NOTE(review): on recent tm versions base functions such as tolower may need
# wrapping in content_transformer() -- confirm against the installed version.
skipWords <- function(x) removeWords(x, stopwords("english"))
funcs <- list(tolower, removePunctuation, removeNumbers,
              stripWhitespace, skipWords)
tdm_control <- list(wordLengths = c(3, 10))

crude1 <- tm_map(corpus1, FUN = tm_reduce, tmFuns = funcs)
crude1.dtm <- TermDocumentMatrix(crude1, control = tdm_control)

# high-frequency terms of the first corpus; this vector was used below but
# never defined in the original snippet.
# NOTE(review): the cut-off of 5 is an assumption -- adjust to your data.
crude1.dtm.freq <- findFreqTerms(crude1.dtm, lowfreq = 5)

# prepare 2nd corpus and process it exactly as above (same funcs, same control)
corpus2 <- Corpus(VectorSource(crude[11:20]))
crude2 <- tm_map(corpus2, FUN = tm_reduce, tmFuns = funcs)
# Build the matrix from crude2 (the original mistakenly reused crude1, so any
# sample output produced by the old code shows documents of the first corpus).
crude2.dtm <- TermDocumentMatrix(crude2, control = tdm_control)
crude2.dtm.mat <- as.matrix(crude2.dtm)

# subset second corpus by words in first corpus
# drop = FALSE keeps a matrix even when only one term matches
crude2.dtm.mat[rownames(crude2.dtm.mat) %in% crude1.dtm.freq, , drop = FALSE]
Docs
Terms reut-00001.xml reut-00002.xml reut-00004.xml reut-00005.xml reut-00006.xml
oil 5 12 2 1 1
opec 0 15 0 0 0
prices 3 5 0 0 0
Docs
Terms reut-00007.xml reut-00008.xml reut-00009.xml reut-00010.xml reut-00011.xml
oil 7 4 3 5 9
opec 8 1 2 2 6
prices 5 1 2 1 9
UPDATE: now that the data has been provided, and following the comments, I think this is a bit closer to your question.
Here's the same process using document-term matrices (DTMs) instead of the term-document matrices (TDMs) I used above, which is only a slight variation:
# Goal: take the high-frequency terms of a first corpus and look up their
# counts in a second corpus, using document-term matrices (documents are rows,
# terms are columns).

# load text mining library
library(tm)

# make corpus for text mining (data comes from package, for reproducibility)
data("crude")
corpus1 <- Corpus(VectorSource(crude[1:10]))

# process text (your methods may differ)
# Every element of `funcs` must be a function, because tm_reduce() folds them
# into a single transformation. The stray `MinDocFrequency = 5` entry was a
# number, not a transformation, so it has been removed.
# NOTE(review): on recent tm versions base functions such as tolower may need
# wrapping in content_transformer() -- confirm against the installed version.
skipWords <- function(x) removeWords(x, stopwords("english"))
funcs <- list(tolower, removePunctuation, removeNumbers,
              stripWhitespace, skipWords)
dtm_control <- list(wordLengths = c(3, 10))

crude1 <- tm_map(corpus1, FUN = tm_reduce, tmFuns = funcs)
crude1.dtm <- DocumentTermMatrix(crude1, control = dtm_control)

# high-frequency terms of the first corpus; this vector was used below but
# never defined in the original snippet.
# NOTE(review): the cut-off of 5 is an assumption -- adjust to your data.
crude1.dtm.freq <- findFreqTerms(crude1.dtm, lowfreq = 5)

# second corpus, processed exactly as the first (same funcs, same control)
corpus2 <- Corpus(VectorSource(crude[11:20]))
crude2 <- tm_map(corpus2, FUN = tm_reduce, tmFuns = funcs)
# Build the matrix from crude2 (the original mistakenly reused crude1, so any
# sample output produced by the old code shows documents of the first corpus).
crude2.dtm <- DocumentTermMatrix(crude2, control = dtm_control)
crude2.dtm.mat <- as.matrix(crude2.dtm)

# keep only the columns (terms) that are high-frequency in the first corpus;
# drop = FALSE keeps a matrix even when only one term matches
crude2.dtm.mat[, colnames(crude2.dtm.mat) %in% crude1.dtm.freq, drop = FALSE]
Terms
Docs oil opec prices
reut-00001.xml 5 0 3
reut-00002.xml 12 15 5
reut-00004.xml 2 0 0
reut-00005.xml 1 0 0
reut-00006.xml 1 0 0
reut-00007.xml 7 8 5
reut-00008.xml 4 1 1
reut-00009.xml 3 2 2
reut-00010.xml 5 2 1
reut-00011.xml 9 6 9
And here's a solution using the data that was added to the OP's question:
# Training texts and validation texts taken from the OP's question.
text <- c(
  "saying text is good",
  "saying text once and saying text twice is better",
  "saying text text text is best",
  "saying text once is still ok",
  "not saying it at all is bad",
  "because text is a good thing",
  "we all like text",
  "even though sometimes it is missing"
)
validationText <- c(
  "This has different words in it.",
  "But I still want to count",
  "the occurence of text",
  "for example"
)

TextCorpus <- Corpus(VectorSource(text))
ValiTextCorpus <- Corpus(VectorSource(validationText))

# One shared control list so both matrices are preprocessed identically.
# `MinDocFrequency = 5` was dropped: the frequency cut-off is applied
# explicitly by findFreqTerms() below.
# NOTE(review): MinDocFrequency does not appear to be a control option that
# DocumentTermMatrix()/termFreq() recognise -- confirm for your tm version.
Control <- list(
  stopwords = TRUE,
  removePunctuation = TRUE,
  removeNumbers = TRUE
)
TextDTM <- DocumentTermMatrix(TextCorpus, control = Control)
ValiTextDTM <- DocumentTermMatrix(ValiTextCorpus, control = Control)

# find high frequency terms in TextDTM (terms occurring at least 5 times);
# the outer parentheses print the result as well as assigning it
(TextDTM.hifreq <- findFreqTerms(TextDTM, lowfreq = 5))
[1] "saying" "text"
# find out how many times each high freq word occurs in TextDTM
TextDTM.mat <- as.matrix(TextDTM)
# drop = FALSE keeps a matrix even when there is exactly one high-frequency
# term; without it the subset collapses to a vector and colSums() fails
colSums(TextDTM.mat[, TextDTM.hifreq, drop = FALSE])
saying text
6 9
Here are the key lines: subset the second DTM based on the list of high-frequency words from the first DTM. In this case I've used the `intersect` function, since the vector of high-frequency words includes a word that is not in the second corpus at all (and `intersect` seems to handle that better than `%in%`).
# now look into second DTM
ValiTextDTM.mat <- as.matrix(ValiTextDTM)

# terms of the second DTM that are also high-frequency terms of the first;
# computed once and reused (intersect() simply omits terms that are absent
# from the second corpus)
common_terms <- intersect(colnames(ValiTextDTM.mat), TextDTM.hifreq)

# drop = FALSE keeps the column name when only one term is shared, and
# check.names = FALSE stops data.frame() from rewriting the term names
common <- data.frame(
  ValiTextDTM.mat[, common_terms, drop = FALSE],
  check.names = FALSE
)

# show the per-document counts of the shared terms
common
text
1 0
2 0
3 1
4 0
How to find the total count of the high freq word(s) in the second corpus:
# total count of each shared high-frequency term across the second corpus
colSums(as.matrix(common))
text
1