15

i have built an index in Lucene. I want without specifying a query, just to get a score (cosine similarity or another distance?) between two documents in the index.

For example i am getting from previously opened IndexReader ir the documents with ids 2 and 4. Document d1 = ir.document(2); Document d2 = ir.document(4);

How can i get the cosine similarity between these two documents?

Thank you

amadib
  • 868
  • 14
  • 33
maiky
  • 3,503
  • 7
  • 28
  • 28

7 Answers7

16

As Julia points out Sujit Pal's example is very useful but the Lucene 4 API has substantial changes. Here is a version rewritten for Lucene 4.

import java.io.IOException;
import java.util.*;

import org.apache.commons.math3.linear.*;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.*;
import org.apache.lucene.store.*;
import org.apache.lucene.util.*;

public class CosineDocumentSimilarity {

    public static final String CONTENT = "Content";

    private final Set<String> terms = new HashSet<>();
    private final RealVector v1;
    private final RealVector v2;

    CosineDocumentSimilarity(String s1, String s2) throws IOException {
        Directory directory = createIndex(s1, s2);
        IndexReader reader = DirectoryReader.open(directory);
        Map<String, Integer> f1 = getTermFrequencies(reader, 0);
        Map<String, Integer> f2 = getTermFrequencies(reader, 1);
        reader.close();
        v1 = toRealVector(f1);
        v2 = toRealVector(f2);
    }

    Directory createIndex(String s1, String s2) throws IOException {
        Directory directory = new RAMDirectory();
        Analyzer analyzer = new SimpleAnalyzer(Version.LUCENE_CURRENT);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_CURRENT,
                analyzer);
        IndexWriter writer = new IndexWriter(directory, iwc);
        addDocument(writer, s1);
        addDocument(writer, s2);
        writer.close();
        return directory;
    }

    /* Indexed, tokenized, stored. */
    public static final FieldType TYPE_STORED = new FieldType();

    static {
        TYPE_STORED.setIndexed(true);
        TYPE_STORED.setTokenized(true);
        TYPE_STORED.setStored(true);
        TYPE_STORED.setStoreTermVectors(true);
        TYPE_STORED.setStoreTermVectorPositions(true);
        TYPE_STORED.freeze();
    }

    void addDocument(IndexWriter writer, String content) throws IOException {
        Document doc = new Document();
        Field field = new Field(CONTENT, content, TYPE_STORED);
        doc.add(field);
        writer.addDocument(doc);
    }

    double getCosineSimilarity() {
        return (v1.dotProduct(v2)) / (v1.getNorm() * v2.getNorm());
    }

    public static double getCosineSimilarity(String s1, String s2)
            throws IOException {
        return new CosineDocumentSimilarity(s1, s2).getCosineSimilarity();
    }

    Map<String, Integer> getTermFrequencies(IndexReader reader, int docId)
            throws IOException {
        Terms vector = reader.getTermVector(docId, CONTENT);
        TermsEnum termsEnum = null;
        termsEnum = vector.iterator(termsEnum);
        Map<String, Integer> frequencies = new HashMap<>();
        BytesRef text = null;
        while ((text = termsEnum.next()) != null) {
            String term = text.utf8ToString();
            int freq = (int) termsEnum.totalTermFreq();
            frequencies.put(term, freq);
            terms.add(term);
        }
        return frequencies;
    }

    RealVector toRealVector(Map<String, Integer> map) {
        RealVector vector = new ArrayRealVector(terms.size());
        int i = 0;
        for (String term : terms) {
            int value = map.containsKey(term) ? map.get(term) : 0;
            vector.setEntry(i++, value);
        }
        return (RealVector) vector.mapDivide(vector.getL1Norm());
    }
}
Mark Butler
  • 4,361
  • 2
  • 39
  • 39
  • 1
    Has VecTextField been taken from [this](http://stackoverflow.com/questions/11945728/how-to-use-termvector-lucene-4-0) question? – o_nix Apr 26 '13 at 15:08
  • I'm test this with Sujit Pal example : Document#0: Mitral valve surgery - minimally invasive (31825) Document#1: Mitral valve surgery - open (31835) Document#2: Laryngectomy (31706) but it got difference reuslt ! can you explain why Thanks – tiendv Jul 25 '13 at 10:30
  • @tiendv how did you get Sujit Pal's documents? He does not provide a link to their contents on his web page? He just lists their titles? If you just used the document titles you will get a big difference because those document titles are very different. – Mark Butler Jul 26 '13 at 00:11
  • yes i see , I have been check this . Sujit Pal's result not true – tiendv Jul 26 '13 at 01:48
  • No, his results may be right - we don't know - he simply has not given enough information to repeat his experiment. – Mark Butler Jul 26 '13 at 03:35
  • Because he said that : Document#0: Mitral valve surgery - minimally invasive (31825) Document#1: Mitral valve surgery - open (31835) Document#2: Laryngectomy (31706) Document#3: Shaken baby syndrome (30 then he said cosim(0,1)=0.9672563304581603 cosim(0,2)=0.7496880467405691 cosim(0,3)=0.23968406019250035 so i thought that true but i'm wrong – tiendv Jul 26 '13 at 08:05
  • What is `terms` in the last line? – Adam_G Nov 16 '17 at 14:48
  • For the latest version of Lucene, the above could would need to be changed slightly in the following manner: Instead of "TYPE_STORED.setIndexed(true);", the following can be written:
    TYPE_STORED.setIndexOptions(IndexOptions.DOCS);
    
    And, within createIndex function, the following can be used:
    Analyzer analyzer = new StandardAnalyzer();
    IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
    
    – Ajitesh Apr 04 '18 at 11:29
13

When indexing, there's an option to store term frequency vectors.

During runtime, look up the term frequency vectors for both documents using IndexReader.getTermFreqVector(), and look up document frequency data for each term using IndexReader.docFreq(). That will give you all the components necessary to calculate the cosine similarity between the two docs.

An easier way might be to submit doc A as a query (adding all words to the query as OR terms, boosting each by term frequency) and look for doc B in the result set.

bajafresh4life
  • 12,491
  • 5
  • 37
  • 46
  • Yes ok for the first, i use the termfreqvector to get what i want, but i wanted to check how much faster would it be the to get similarity from lucene. For the second part of your answer, i checked in the javadoc that there is not an obvious way to get similarity score. Ok, i can look for doc B in the result set but the only i can get is its position in the TopDocs, not the exact similarity score between these two document vectors that i want. – maiky Dec 09 '09 at 15:40
3

It's a very good solution by Mark Butler, however the calculations of the tf/idf weights are WRONG!

Term-Frequency (tf): how much this term appeared in this document (NOT all documents as in the code with termsEnum.totalTermFreq()).

Document Frequency (df): the total number of documents that this term appeared in.

Inverse Document Frequency: idf = log(N/df), where N is the total number of documents.

Tf/idf weight = tf * idf, for a given term and a given document.

I was hoping for an efficient calculation using Lucene! I'm unable to find a an efficient calculation for the correct if/idf weights.

EDIT: I made this code to calculate the weights as tf/idf weights and not as pure term-frequency. It works pretty well, but I wonder if there's a more efficient way.

import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.apache.commons.math3.linear.ArrayRealVector;
import org.apache.commons.math3.linear.RealVector;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;

public class CosineSimeTest {

    public static void main(String[] args) {
        try {
            CosineSimeTest cosSim = new 
                    CosineSimeTest( "This is good", 
                            "This is good" );
            System.out.println( cosSim.getCosineSimilarity() );
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static final String CONTENT = "Content";
    public static final int N = 2;//Total number of documents

    private final Set<String> terms = new HashSet<>();
    private final RealVector v1;
    private final RealVector v2;

    CosineSimeTest(String s1, String s2) throws IOException {
        Directory directory = createIndex(s1, s2);
        IndexReader reader = DirectoryReader.open(directory);
        Map<String, Double> f1 = getWieghts(reader, 0);
        Map<String, Double> f2 = getWieghts(reader, 1);
        reader.close();
        v1 = toRealVector(f1);
        System.out.println( "V1: " +v1 );
        v2 = toRealVector(f2);
        System.out.println( "V2: " +v2 );
    }

    Directory createIndex(String s1, String s2) throws IOException {
        Directory directory = new RAMDirectory();
        Analyzer analyzer = new SimpleAnalyzer(Version.LUCENE_CURRENT);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_CURRENT,
                analyzer);
        IndexWriter writer = new IndexWriter(directory, iwc);
        addDocument(writer, s1);
        addDocument(writer, s2);
        writer.close();
        return directory;
    }

    /* Indexed, tokenized, stored. */
    public static final FieldType TYPE_STORED = new FieldType();

    static {
        TYPE_STORED.setIndexed(true);
        TYPE_STORED.setTokenized(true);
        TYPE_STORED.setStored(true);
        TYPE_STORED.setStoreTermVectors(true);
        TYPE_STORED.setStoreTermVectorPositions(true);
        TYPE_STORED.freeze();
    }

    void addDocument(IndexWriter writer, String content) throws IOException {
        Document doc = new Document();
        Field field = new Field(CONTENT, content, TYPE_STORED);
        doc.add(field);
        writer.addDocument(doc);
    }

    double getCosineSimilarity() {
        double dotProduct = v1.dotProduct(v2);
        System.out.println( "Dot: " + dotProduct);
        System.out.println( "V1_norm: " + v1.getNorm() + ", V2_norm: " + v2.getNorm() );
        double normalization = (v1.getNorm() * v2.getNorm());
        System.out.println( "Norm: " + normalization);
        return dotProduct / normalization;
    }


    Map<String, Double> getWieghts(IndexReader reader, int docId)
            throws IOException {
        Terms vector = reader.getTermVector(docId, CONTENT);
        Map<String, Integer> docFrequencies = new HashMap<>();
        Map<String, Integer> termFrequencies = new HashMap<>();
        Map<String, Double> tf_Idf_Weights = new HashMap<>();
        TermsEnum termsEnum = null;
        DocsEnum docsEnum = null;


        termsEnum = vector.iterator(termsEnum);
        BytesRef text = null;
        while ((text = termsEnum.next()) != null) {
            String term = text.utf8ToString();
            int docFreq = termsEnum.docFreq();
            docFrequencies.put(term, reader.docFreq( new Term( CONTENT, term ) ));

            docsEnum = termsEnum.docs(null, null);
            while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                termFrequencies.put(term, docsEnum.freq());
            }

            terms.add(term);
        }

        for ( String term : docFrequencies.keySet() ) {
            int tf = termFrequencies.get(term);
            int df = docFrequencies.get(term);
            double idf = ( 1 + Math.log(N) - Math.log(df) );
            double w = tf * idf;
            tf_Idf_Weights.put(term, w);
            //System.out.printf("Term: %s - tf: %d, df: %d, idf: %f, w: %f\n", term, tf, df, idf, w);
        }

        System.out.println( "Printing docFrequencies:" );
        printMap(docFrequencies);

        System.out.println( "Printing termFrequencies:" );
        printMap(termFrequencies);

        System.out.println( "Printing if/idf weights:" );
        printMapDouble(tf_Idf_Weights);
        return tf_Idf_Weights;
    }

    RealVector toRealVector(Map<String, Double> map) {
        RealVector vector = new ArrayRealVector(terms.size());
        int i = 0;
        double value = 0;
        for (String term : terms) {

            if ( map.containsKey(term) ) {
                value = map.get(term);
            }
            else {
                value = 0;
            }
            vector.setEntry(i++, value);
        }
        return vector;
    }

    public static void printMap(Map<String, Integer> map) {
        for ( String key : map.keySet() ) {
            System.out.println( "Term: " + key + ", value: " + map.get(key) );
        }
    }

    public static void printMapDouble(Map<String, Double> map) {
        for ( String key : map.keySet() ) {
            System.out.println( "Term: " + key + ", value: " + map.get(key) );
        }
    }

}
Jack Twain
  • 6,273
  • 15
  • 67
  • 107
  • Thanks for your feedback but as I understand it you do not need to calculate TF-IDF to calculate cosine similarity. You could calculate a similarity metric using TF-IDF if you want, but that was not the aim of the code above. Specifically I am using the algorithm above to test how well some automatic extraction code works against some human generated answers on a per document basis. TF-IDF would not help in that case, which is why I did not use it. – Mark Butler Jun 08 '13 at 02:13
  • Also I am happy to work with you optimising your code and I can see a few basic things you could do but it would be better if you posted it under a new question as this one did not mention TF-IDF? You could always cite this question? – Mark Butler Jun 08 '13 at 04:49
2

I know question has been answered, but for the people who might come here in future, nice example of the solution can be found here:

http://sujitpal.blogspot.ch/2011/10/computing-document-similarity-using.html

Julia
  • 1,217
  • 8
  • 23
  • 46
1

you can find better solution @ http://darakpanand.wordpress.com/2013/06/01/document-comparison-by-cosine-methodology-using-lucene/#more-53 . following are the steps

  • java code which builds term vector from content with the help of Lucene(check:http://lucene.apache.org/core/).
  • By using commons-math.jar library cosine calculation between two documents is done.
1

Calculating Cosine Similarity in Lucene Version 4.x is different than those from 3.x. Following post has detailed explanation with all the necessary code for calculating cosine similarity in Lucene 4.10.2. ComputerGodzilla: Calculated Cosine Similarity in Lucene!

Mubin Shrestha
  • 398
  • 3
  • 22
0

If you don't need to store documents to Lucene and just want to calculate similarity between two docs, here's the faster code (Scala, from my blog http://chepurnoy.org/blog/2014/03/faster-cosine-similarity-between-two-dicuments-with-scala-and-lucene/ )

def extractTerms(content: String): Map[String, Int] = {    
     val analyzer = new StopAnalyzer(Version.LUCENE_46)
     val ts = new EnglishMinimalStemFilter(analyzer.tokenStream("c", content))
     val charTermAttribute = ts.addAttribute(classOf[CharTermAttribute])

     val m = scala.collection.mutable.Map[String, Int]()

     ts.reset()
     while (ts.incrementToken()) {
         val term = charTermAttribute.toString
         val newCount = m.get(term).map(_ + 1).getOrElse(1)
         m += term -> newCount       
     }

     m.toMap
 }

def similarity(t1: Map[String, Int], t2: Map[String, Int]): Double = {
     //word, t1 freq, t2 freq
     val m = scala.collection.mutable.HashMap[String, (Int, Int)]()

     val sum1 = t1.foldLeft(0d) {case (sum, (word, freq)) =>
         m += word ->(freq, 0)
         sum + freq
     }

     val sum2 = t2.foldLeft(0d) {case (sum, (word, freq)) =>
         m.get(word) match {
             case Some((freq1, _)) => m += word ->(freq1, freq)
             case None => m += word ->(0, freq)
         }
         sum + freq
     }

     val (p1, p2, p3) = m.foldLeft((0d, 0d, 0d)) {case ((s1, s2, s3), e) =>
         val fs = e._2
         val f1 = fs._1 / sum1
         val f2 = fs._2 / sum2
         (s1 + f1 * f2, s2 + f1 * f1, s3 + f2 * f2)
     }

     val cos = p1 / (Math.sqrt(p2) * Math.sqrt(p3))
     cos
 }  

So, to calculate similarity between text1 and text2 just call similarity(extractTerms(text1), extractTerms(text2))