I am going to calculate similarity between almost 14 thousand documents. But code is taking too much time for execution. Is there any other way to do same work faster?
Here is my code
wb=createWorkbook() #create workbook
addWorksheet(wb,"absSim") #create worksheet
listoffiles=list.files() #get list of documents from current working directory
fileslength=length(listoffiles) #no of documents in directory
for(i in 1:fileslength-1)
{
d1=readLines(listoffiles[i])# read first document
k=i+1
for(j in k:fileslength)
{
d2=readLines(listoffiles[j]) #read second document
#make a vector of two documents
myvector=c(d1,d2)
#making corpus of two documents
mycorpus=Corpus(VectorSource(myvector))
#preprocessing of corpus
mycorpus=tm_map(mycorpus,removePunctuation)
mycorpus=tm_map(mycorpus,removeNumbers)
mycorpus=tm_map(mycorpus,stripWhitespace)
mycorpus=tm_map(mycorpus,tolower)
mycorpus=tm_map(mycorpus,function(x) removeWords(x,stopwords("english")))
mycorpus=tm_map(mycorpus,function(x) removeWords(x,"x"))
#make a document term matrix now
dtm=as.matrix(DocumentTermMatrix(mycorpus))
#compute distance of both documents using proxy package
cdist=as.matrix(dist(dtm,method = "cosine"))
jdist=as.matrix(dist(dtm,method = "jaccard"))
#compute similarity
csim=1-cdist
jsim=1-jdist
#get similarity of both documents
cos=csim[1,2]
jac=jsim[1,2]
if(cos>0 | jac>0)
{
writeData(wb,"absSim",cos,startCol = 1,startRow = rownum)
writeData(wb,"absSim",jac,startCol = 2,startRow = rownum)
saveWorkbook(wb,"abstractSimilarity.xlsx",overwrite = TRUE)
rownum=rownum+1
}
}
}
When I run this code, the first document executed in 2 hr. Is there any idea to calculate cosine and jaccard similarity faster?