1

Currently I try to get the cosine similarity between two document with Lucene (4.10.4). I already read this answer about cosine similarity with Lucene , and I used this example to understand how it works with Lucene. But when I tested with 2 same words per each document (ex: "Hello world"), I've got a cosine of similarity at 0.9999999999999998

My code look like that:

import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.apache.commons.io.FileUtils;
import org.apache.commons.math3.linear.ArrayRealVector;
import org.apache.commons.math3.linear.RealVector;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class CosineSimeTest {

    static String indexName = "/tmp/CosineExample";
    public static final String CONTENT = "field";
    public static final int N = 2;
    private final Set<String> terms = new HashSet<>();
    private final RealVector v1;
    private final RealVector v2;

    public static void main(String[] args) {
    try {
        CosineSimeTest cosSim = new CosineSimeTest("hello world", "hello world");
        System.out.println(cosSim.getCosineSimilarity());
    } catch (IOException e) {
        e.printStackTrace();
    }
    }

    public CosineSimeTest(String s1, String s2) throws IOException {
    Directory directory = createIndex(s1, s2);
    IndexReader reader = DirectoryReader.open(directory);
    Map<String, Double> f1 = getWieghts(reader, 0);
    Map<String, Double> f2 = getWieghts(reader, 1);
    reader.close();
    v1 = toRealVector(f1);
    System.out.println("V1: " + v1);
    v2 = toRealVector(f2);
    System.out.println("V2: " + v2);
    }

    public Directory createIndex(String s1, String s2) throws IOException {
    File f = new File(indexName);
    if (f.exists()) {
        FileUtils.deleteDirectory(f);
    }
    Directory directory = FSDirectory.open(new File(indexName));
    StandardAnalyzer analyzer = new StandardAnalyzer();
    IndexWriterConfig iwc = new IndexWriterConfig(null, analyzer);
    IndexWriter writer = new IndexWriter(directory, iwc);

    addDocument(writer, s1);
    addDocument(writer, s2);
    writer.close();
    return directory;
    }

    public void addDocument(IndexWriter writer, String data) throws IOException {
    Document doc = new Document();

    FieldType type = new FieldType();
    type.setIndexed(true);
    type.setStoreTermVectors(true);
    type.setStoreTermVectorPositions(true);
    type.freeze();
    Field field = new Field(CONTENT, data, type);

    doc.add(field);
    writer.addDocument(doc);
    }

    public double getCosineSimilarity() {
    double dotProduct = v1.dotProduct(v2);
    System.out.println("Dot: " + dotProduct);
    System.out.println("V1_norm: " + v1.getNorm() + ", V2_norm: " + v2.getNorm());
    double normalization = (v1.getNorm() * v2.getNorm());
    System.out.println("Norm: " + normalization);
    return dotProduct / normalization;
    }

    public Map<String, Double> getWieghts(IndexReader reader, int docId) throws IOException {
    Terms vector = reader.getTermVector(docId, CONTENT);
    Map<String, Integer> docFrequencies = new HashMap<>();
    Map<String, Integer> termFrequencies = new HashMap<>();
    Map<String, Double> tf_Idf_Weights = new HashMap<>();
    TermsEnum termsEnum = null;
    DocsEnum docsEnum = null;

    termsEnum = vector.iterator(termsEnum);
    BytesRef text = null;
    while ((text = termsEnum.next()) != null) {
        String term = text.utf8ToString();
        docFrequencies.put(term, reader.docFreq(new Term(CONTENT, term)));

        docsEnum = termsEnum.docs(null, null);
        while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        termFrequencies.put(term, docsEnum.freq());
        }
        terms.add(term);
    }

    for (String term : docFrequencies.keySet()) {
        int tf = termFrequencies.get(term);
        int df = docFrequencies.get(term);
        double idf = (1 + Math.log(N) - Math.log(df));
        double w = tf * idf;
        tf_Idf_Weights.put(term, w);
    }

    // System.out.println("Printing docFrequencies:");
    // printMap(docFrequencies);
    //
    // System.out.println("Printing termFrequencies:");
    // printMap(termFrequencies);
    //
    // System.out.println("Printing if/idf weights:");
    // printMapDouble(tf_Idf_Weights);
    return tf_Idf_Weights;
    }

    public RealVector toRealVector(Map<String, Double> map) {
    RealVector vector = new ArrayRealVector(terms.size());
    int i = 0;
    double value = 0;
    for (String term : terms) {
        if (map.containsKey(term)) {
        value = map.get(term);
        } else {
        value = 0;
        }
        vector.setEntry(i++, value);
    }
    return vector;
    }

    public static void printMap(Map<String, Integer> map) {
    for (String key : map.keySet()) {
        System.out.println("Term: " + key + ", value: " + map.get(key));
    }
    }

    public static void printMapDouble(Map<String, Double> map) {
    for (String key : map.keySet()) {
        System.out.println("Term: " + key + ", value: " + map.get(key));
    }
    }

    public void getVersionOfLucene(StandardAnalyzer analyzer) {
    System.out.println("version : " + analyzer.getVersion());
    }

}

What is the problem ? How to fix this ?

Thanks in advance.

Community
  • 1
  • 1
pi-2r
  • 1,259
  • 4
  • 27
  • 52
  • 2
    It looks like it might be floating point errors (certain decimal numbers cannot be represented exactly in binary, so the represented value may be slightly off from the actual value) – Qwerty01 Nov 17 '15 at 17:58
  • Possible duplicate of [Is floating point math broken?](http://stackoverflow.com/questions/588004/is-floating-point-math-broken) – femtoRgon Nov 18 '15 at 00:06

0 Answers0