Avoid storing very small numbers as they are: they are prone to underflow problems (see "What are arithmetic underflow and overflow in C?"), and dividing one small number by another can make the problem even worse. Instead, preprocess your relative frequencies by taking their logarithm, like this:
>>> import math
>>> num = 1.435486010883783160220299732E-8
>>> logged = math.log(num)
>>> logged
-18.0591772685384
>>> math.exp(logged)
1.4354860108837844e-08
Now to reading the CSV files. Since you're only manipulating the relative frequencies, the 2nd column doesn't matter, so let's skip it and save the first column (i.e. the phrase) as the key and the third column (i.e. the relative frequency) as the value:
import csv
import math


def read_log_relfreqs(path):
    """Map each phrase in an n-gram CSV to the log of its relative frequency.

    Every non-blank row must have exactly three columns: phrase, raw count
    and relative frequency. The raw count is ignored.

    Raises:
        ValueError: if a row does not have three columns, or its relative
            frequency is not a positive number (math.log rejects 0).
    """
    logfreqs = {}
    with open(path, 'r') as fin:
        for row in csv.reader(fin, delimiter=','):
            if not row:
                # csv.reader yields an empty list for a blank line.
                continue
            phrase, _raw, rel = row
            # float() tolerates the space that follows each comma.
            logfreqs[phrase] = math.log(float(rel))
    return logfreqs


# Writes a dummy csv file as example.
textfile = """drinks while strutting, 4, 1.435486010883783160220299732E-8
and since that, 6, 4.306458032651349480660899195E-8
the state face, 3, 2.153229016325674740330449597E-8"""
textfile2 = """and since that, 3, 2.1532290163256747e-08
the state face, 1, 7.1774300544189156e-09
drinks while strutting, 2, 7.1774300544189156e-09
some silly ngram, 99, 1.235492312e-09"""
with open('ngrams-1.csv', 'w') as fout:
    fout.write(textfile + '\n')
with open('ngrams-2.csv', 'w') as fout:
    fout.write(textfile2 + '\n')

# Read and save the two files into a dict structure
ngramfile1 = 'ngrams-1.csv'
ngramfile2 = 'ngrams-2.csv'
ngramdict1 = read_log_relfreqs(ngramfile1)
ngramdict2 = read_log_relfreqs(ngramfile2)
Now for the tricky part: you want to divide the relative frequency of each phrase in ngramdict2 by the relative frequency of the same phrase in ngramdict1, i.e.:
# Pseudo-code: the ratio wanted for a phrase that appears in both dicts.
if phrase_from_ngramdict1 == phrase_from_ngramdict2:
    relfreq = relfreq_from_ngramdict2 / relfreq_from_ngramdict1
Since we kept the relative frequencies in logarithmic units, we don't have to divide; we simply subtract, because log(a / b) = log(a) - log(b), i.e.
# Pseudo-code: in log space the ratio is a difference,
# since log(a / b) == log(a) - log(b).
if phrase_from_ngramdict1 == phrase_from_ngramdict2:
    logrelfreq = logrelfreq_from_ngramdict2 - logrelfreq_from_ngramdict1
And to get the phrases that occur in both, you won't need to check the phrases one by one; simply cast each dictionary.keys() into a set and then do set1.intersection(set2), see https://docs.python.org/2/tutorial/datastructures.html
# Iterating a dict yields its keys, so set(d) equals set(d.keys()).
phrases1 = set(ngramdict1)
phrases2 = set(ngramdict2)
overlap_phrases = phrases1.intersection(phrases2)
# Parenthesised single-argument print works on both Python 2 and Python 3.
print(overlap_phrases)
[out]:
set(['drinks while strutting', 'the state face', 'and since that'])
So now let's write the overlapping phrases out to a file together with their combined (log) relative frequencies:
# Write one "phrase,log-ratio" row per phrase found in both dicts.
with open('ngramcombined.csv', 'w') as fout:
    # csv.writer quotes a phrase that itself contains a comma or a quote,
    # which a plain ",".join would write out as a corrupt row.
    writer = csv.writer(fout, lineterminator='\n')
    for p in overlap_phrases:
        relfreq1 = ngramdict1[p]
        relfreq2 = ngramdict2[p]
        # log(rel2 / rel1) == log(rel2) - log(rel1)
        combined_relfreq = relfreq2 - relfreq1
        writer.writerow([p, str(combined_relfreq)])
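The resulting ngramcombined.csv looks like this (the second column is the log of the ratio; apply math.exp to it to get the plain ratio back):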
drinks while strutting,-0.69314718056
the state face,-1.09861228867
and since that,-0.69314718056
Here's the full code:
import csv
import math


def read_log_relfreqs(path):
    """Map each phrase in an n-gram CSV to the log of its relative frequency.

    Every non-blank row must have exactly three columns: phrase, raw count
    and relative frequency. The raw count is ignored.

    Raises:
        ValueError: if a row does not have three columns, or its relative
            frequency is not a positive number (math.log rejects 0).
    """
    logfreqs = {}
    with open(path, 'r') as fin:
        for row in csv.reader(fin, delimiter=','):
            if not row:
                # csv.reader yields an empty list for a blank line.
                continue
            phrase, _raw, rel = row
            # float() tolerates the space that follows each comma.
            logfreqs[phrase] = math.log(float(rel))
    return logfreqs


def write_log_ratios(path, phrases, numer_logs, denom_logs):
    """Write one ``phrase,log-ratio`` row per phrase to ``path``.

    The log-ratio is ``numer_logs[p] - denom_logs[p]``, i.e. the log of the
    quotient of the two underlying relative frequencies.

    Raises:
        KeyError: if a phrase is missing from either mapping.
    """
    with open(path, 'w') as fout:
        # csv.writer quotes a phrase that itself contains a comma or a
        # quote, which a plain ",".join would write out as a corrupt row.
        writer = csv.writer(fout, lineterminator='\n')
        for p in phrases:
            writer.writerow([p, str(numer_logs[p] - denom_logs[p])])


# Writes a dummy csv file as example.
textfile = """drinks while strutting, 4, 1.435486010883783160220299732E-8
and since that, 6, 4.306458032651349480660899195E-8
the state face, 3, 2.153229016325674740330449597E-8"""
textfile2 = """and since that, 3, 2.1532290163256747e-08
the state face, 1, 7.1774300544189156e-09
drinks while strutting, 2, 7.1774300544189156e-09
some silly ngram, 99, 1.235492312e-09"""
with open('ngrams-1.csv', 'w') as fout:
    fout.write(textfile + '\n')
with open('ngrams-2.csv', 'w') as fout:
    fout.write(textfile2 + '\n')

# Read and save the two files into a dict structure
ngramfile1 = 'ngrams-1.csv'
ngramfile2 = 'ngrams-2.csv'
ngramdict1 = read_log_relfreqs(ngramfile1)
ngramdict2 = read_log_relfreqs(ngramfile2)

# Find the intersecting phrases.
phrases1 = set(ngramdict1)
phrases2 = set(ngramdict2)
overlap_phrases = phrases1.intersection(phrases2)

# Output to new file.
write_log_ratios('ngramcombined.csv', overlap_phrases, ngramdict2, ngramdict1)
If you like SUPER UNREADABLE but short code (in no. of lines):
import csv
import math


def load(path):
    """Return {phrase: log(relative frequency)} for the n-gram CSV at path."""
    # The with-block closes the file; opening it inline inside the
    # comprehension would leave the handle open.
    with open(path, 'r') as fin:
        return {row[0]: math.log(float(row[2]))
                for row in csv.reader(fin, delimiter=',') if row}


# Read and save the two files into a dict structure
ngramfile1 = 'ngrams-1.csv'
ngramfile2 = 'ngrams-2.csv'
ngramdict1 = load(ngramfile1)
ngramdict2 = load(ngramfile2)
# Find the intersecting phrases.
overlap_phrases = set(ngramdict1).intersection(ngramdict2)
# Output to new file.
with open('ngramcombined.csv', 'w') as fout:
    # csv.writer quotes phrases that contain commas or quotes.
    csv.writer(fout, lineterminator='\n').writerows(
        [p, str(ngramdict2[p] - ngramdict1[p])] for p in overlap_phrases)