How to obtain the cosine similarity with Lucene

Question

Currently I try to get the cosine similarity between two document with Lucene (4.10.4). I already read this answer about cosine similarity with Lucene , and I used this example to understand how it works with Lucene. But when I tested with 2 same words per each document (ex: "Hello world"), I've got a cosine of similarity at 0.9999999999999998

My code look like that:

import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.apache.commons.io.FileUtils;
import org.apache.commons.math3.linear.ArrayRealVector;
import org.apache.commons.math3.linear.RealVector;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class CosineSimeTest {

    static String indexName = "/tmp/CosineExample";
    public static final String CONTENT = "field";
    public static final int N = 2;
    private final Set<String> terms = new HashSet<>();
    private final RealVector v1;
    private final RealVector v2;

    public static void main(String[] args) {
    try {
        CosineSimeTest cosSim = new CosineSimeTest("hello world", "hello world");
        System.out.println(cosSim.getCosineSimilarity());
    } catch (IOException e) {
        e.printStackTrace();
    }
    }

    public CosineSimeTest(String s1, String s2) throws IOException {
    Directory directory = createIndex(s1, s2);
    IndexReader reader = DirectoryReader.open(directory);
    Map<String, Double> f1 = getWieghts(reader, 0);
    Map<String, Double> f2 = getWieghts(reader, 1);
    reader.close();
    v1 = toRealVector(f1);
    System.out.println("V1: " + v1);
    v2 = toRealVector(f2);
    System.out.println("V2: " + v2);
    }

    public Directory createIndex(String s1, String s2) throws IOException {
    File f = new File(indexName);
    if (f.exists()) {
        FileUtils.deleteDirectory(f);
    }
    Directory directory = FSDirectory.open(new File(indexName));
    StandardAnalyzer analyzer = new StandardAnalyzer();
    IndexWriterConfig iwc = new IndexWriterConfig(null, analyzer);
    IndexWriter writer = new IndexWriter(directory, iwc);

    addDocument(writer, s1);
    addDocument(writer, s2);
    writer.close();
    return directory;
    }

    public void addDocument(IndexWriter writer, String data) throws IOException {
    Document doc = new Document();

    FieldType type = new FieldType();
    type.setIndexed(true);
    type.setStoreTermVectors(true);
    type.setStoreTermVectorPositions(true);
    type.freeze();
    Field field = new Field(CONTENT, data, type);

    doc.add(field);
    writer.addDocument(doc);
    }

    public double getCosineSimilarity() {
    double dotProduct = v1.dotProduct(v2);
    System.out.println("Dot: " + dotProduct);
    System.out.println("V1_norm: " + v1.getNorm() + ", V2_norm: " + v2.getNorm());
    double normalization = (v1.getNorm() * v2.getNorm());
    System.out.println("Norm: " + normalization);
    return dotProduct / normalization;
    }

    public Map<String, Double> getWieghts(IndexReader reader, int docId) throws IOException {
    Terms vector = reader.getTermVector(docId, CONTENT);
    Map<String, Integer> docFrequencies = new HashMap<>();
    Map<String, Integer> termFrequencies = new HashMap<>();
    Map<String, Double> tf_Idf_Weights = new HashMap<>();
    TermsEnum termsEnum = null;
    DocsEnum docsEnum = null;

    termsEnum = vector.iterator(termsEnum);
    BytesRef text = null;
    while ((text = termsEnum.next()) != null) {
        String term = text.utf8ToString();
        docFrequencies.put(term, reader.docFreq(new Term(CONTENT, term)));

        docsEnum = termsEnum.docs(null, null);
        while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        termFrequencies.put(term, docsEnum.freq());
        }
        terms.add(term);
    }

    for (String term : docFrequencies.keySet()) {
        int tf = termFrequencies.get(term);
        int df = docFrequencies.get(term);
        double idf = (1 + Math.log(N) - Math.log(df));
        double w = tf * idf;
        tf_Idf_Weights.put(term, w);
    }

    // System.out.println("Printing docFrequencies:");
    // printMap(docFrequencies);
    //
    // System.out.println("Printing termFrequencies:");
    // printMap(termFrequencies);
    //
    // System.out.println("Printing if/idf weights:");
    // printMapDouble(tf_Idf_Weights);
    return tf_Idf_Weights;
    }

    public RealVector toRealVector(Map<String, Double> map) {
    RealVector vector = new ArrayRealVector(terms.size());
    int i = 0;
    double value = 0;
    for (String term : terms) {
        if (map.containsKey(term)) {
        value = map.get(term);
        } else {
        value = 0;
        }
        vector.setEntry(i++, value);
    }
    return vector;
    }

    public static void printMap(Map<String, Integer> map) {
    for (String key : map.keySet()) {
        System.out.println("Term: " + key + ", value: " + map.get(key));
    }
    }

    public static void printMapDouble(Map<String, Double> map) {
    for (String key : map.keySet()) {
        System.out.println("Term: " + key + ", value: " + map.get(key));
    }
    }

    public void getVersionOfLucene(StandardAnalyzer analyzer) {
    System.out.println("version : " + analyzer.getVersion());
    }

}

What is the problem ? How to fix this ?

Thanks in advance.

It looks like it might be floating point errors (certain decimal numbers cannot be represented exactly in binary, so the represented value may be slightly off from the actual value) — Qwerty01, Nov 17 '15 at 17:58
Possible duplicate of [Is floating point math broken?](http://stackoverflow.com/questions/588004/is-floating-point-math-broken) — femtoRgon, Nov 18 '15 at 00:06

How to obtain the cosine similarity with Lucene

0 Answers0