Basically, I am trying to implement the workflow from this talk: https://www.youtube.com/watch?v=sI7VpFNiy_I&t=20m
Another person was already working on the step after the one I am stuck on: Second-order cooccurrence of terms in texts
My questions are... Why is function "A" failing in my example? How can I make function "A" work in my example?
In the video above, Thomas defines a function (below, function "A") that takes a co-occurrence matrix and returns a correlation matrix.
I am just getting out of pandas land and into scipy, so I have no idea what is going on in the code, or whether it still works at all.
def _association_matrix(self, matrix):
    O = matrix.copy()
    # N: grand total of all co-occurrence counts
    N = O.sum(1).sum()
    # R: same sparsity pattern as O, every stored entry set to 1/N
    R = O.copy()
    R.data = np.ones(len(R.data))
    R.data = R.data / N
    # row and column marginal frequencies, placed on diagonal matrices
    Fx_old = np.array(O.sum(1).flatten())
    Fy_old = np.array(O.sum(0).flatten())
    Fx = spdiags(Fx_old, 0, Fx_old.shape[0], Fx_old.shape[0])
    Fy = spdiags(Fy_old, 0, Fy_old.shape[0], Fy_old.shape[0])
    Fx = Fx.tocsc()
    Fy = Fy.tocsc()
    # E: expected counts under independence
    E = Fx * R * Fy
    E.tocsc()
    # compute poisson association measure
    poi = E.copy()
    poi.data = (np.sign(O.data - E.data) *
                (O.data * np.log(O.data / E.data) -
                 (O.data - E.data)))
    return poi
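As far as I can tell, the last few lines compute, for each stored entry,

sign(O - E) * (O * log(O / E) - (O - E))

where E = Fx * R * Fy is the matrix of expected counts, so I assume this is the "poisson association measure" the comment mentions, though I may be misreading it.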
I have tried the code on a co-occurrence matrix I made (code below) and I get the following error: ValueError: could not broadcast input array from shape (5158) into shape (1)
from nltk.corpus import state_union
StateofUnion = []
for i in state_union.fileids():
    StateofUnion.append(state_union.raw(i))
import pandas as pd
# I didn't download most of nltk, so loading all of nltk isn't heavy for me,
# but it would be heavy for most users
import nltk
years = pd.Series(state_union.fileids()).str.slice(stop=4)
presidents = pd.Series(state_union.fileids()).str.slice(start=5, stop=-4)
presidents = presidents.str.replace('|'.join(['1', '2', '-']), '')
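# note: the replace above strips '1', '2', and '-', so e.g. 'Johnson-1',
# 'Johnson-2' and 'Bush-1' collapse to plain 'Johnson' and 'Bush'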
# map each president to a party
party_of = {'Truman': 'dem', 'Eisenhower': 'rep', 'Kennedy': 'dem',
            'Johnson': 'dem', 'Nixon': 'rep', 'Ford': 'rep',
            'Carter': 'dem', 'Reagan': 'rep', 'Bush': 'rep',
            'Clinton': 'dem', 'GWBush': 'rep'}
party = [party_of[p] for p in presidents]
fileid = pd.Series(state_union.fileids()).str.slice(stop=-4)
SOUA = pd.DataFrame({'Address': StateofUnion,
                     'Year': years,
                     'President': presidents,
                     'Party': party,
                     'recid': fileid})
# A simple cleaning function; yes, it's bad, but it's also good enough for now
def scrub(pandascolumn):
    # assumes we are dealing with a pandas Series of strings
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
    stop = stopwords.words('english')
    # stripping these characters destroys sentence tokenization
    badchars = ['!', '@', '#', '$', '%', 'ą', 'ś',
                '^', '&', '\.', ',', '\'', '\)',
                '\(', '\[', '\]', '\?', '\:',
                '\--', '\;']
    pattern = '|'.join(badchars)
    return pandascolumn.str.lower().\
        str.replace(pattern, '').\
        apply(word_tokenize).\
        apply(lambda x: [item for item in x if item not in stop]).\
        apply(' '.join)
SOUA=SOUA.set_index('recid')
SOUA['ScrubbedAddress']=scrub(SOUA['Address'])
def get_skip_bigram(Series, n, k):
    return Series.str.split().apply(lambda x: list(nltk.skipgrams(x, n, k)))
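# for reference, skipgrams with n=2, k=1 yields bigrams that may skip up to
# one token, e.g. list(nltk.skipgrams(['we','like','green','eggs'], 2, 1))
# -> [('we','like'), ('we','green'), ('like','green'), ('like','eggs'), ('green','eggs')]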
grams = get_skip_bigram(SOUA[SOUA['President'] == 'Clinton']['ScrubbedAddress'], 2, 1)
skiplist = []
for doc in grams:
    for pair in doc:
        skiplist.append(list(pair))
def get_vocab(doc):
    vocab = set()
    for pair in doc:
        vocab.update(pair)
    return vocab
vocab = get_vocab(skiplist)
import scipy.sparse as sp
voc2id = dict(zip(vocab, range(len(vocab))))
# build an incidence matrix X: one row per skipgram, one column per term
rows, cols, vals = [], [], []
for r, d in enumerate(skiplist):
    for e in d:
        rows.append(r)
        cols.append(voc2id[e])
        vals.append(1)
X = sp.csc_matrix((vals, (rows, cols)))
# X.T*X counts, for each pair of terms, how many skipgrams contain both
Xc = X.T * X
Xc.setdiag(0)  # zero out each term's co-occurrence with itself
print(Xc.toarray())
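# toy check of the construction: for skiplist [['a','b'], ['a','c']],
# X has rows [1,1,0] and [1,0,1] over columns (a,b,c), and X.T*X gives
# a-b = 1, a-c = 1, b-c = 0, with the diagonal holding raw term counts
# before setdiag(0) clears it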
import numpy as np
from scipy import sparse
from scipy.sparse import spdiags
def _association_matrix(matrix):
    O = matrix.copy()
    #N compute expected matrix
    N = O.sum(1).sum()
    R = O.copy()
    R.data = np.ones(len(R.data))
    R.data = R.data / N
    Fx_old = np.array(O.sum(1).flatten())
    Fy_old = np.array(O.sum(0).flatten())
    Fx = spdiags(Fx_old, 0, Fx_old.shape[0], Fx_old.shape[0])
    Fy = spdiags(Fy_old, 0, Fy_old.shape[0], Fy_old.shape[0])
    Fx = Fx.tocsc()
    Fy = Fy.tocsc()
    E = Fx * R * Fy
    E.tocsc()
    #compute poisson association measure
    poi = E.copy()
    poi.data = (np.sign(O.data - E.data) *
                (O.data * np.log(O.data / E.data) -
                 (O.data - E.data)))
    return poi
_association_matrix(Xc)
The part that produces the error is "Fx.tocsc()"; I do not understand why, or how to fix it.
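One thing I did notice while poking at it (this is a guess, not a diagnosis): O.sum(1) on a scipy sparse matrix returns a numpy matrix, and .flatten() on a numpy matrix stays two-dimensional, so Fx_old seems to come out with shape (1, n) instead of (n,), which makes Fx_old.shape[0] equal to 1:

import numpy as np
import scipy.sparse as sp
M = sp.csc_matrix(np.arange(9).reshape(3, 3))
Fx_old = np.array(M.sum(1).flatten())
print(Fx_old.shape)                        # (1, 3) -- still 2-D
print(np.asarray(M.sum(1)).ravel().shape)  # (3,)   -- 1-D, what I expected

If that is right, spdiags is being asked to build a 1x1 diagonal matrix out of my 5158 marginal frequencies, which would be consistent with the broadcast error, but I am not sure whether np.asarray(...).ravel() is the correct fix or whether the function simply expects a different input format than my Xc.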
Again, I have two pressing questions. Why is function "A" failing in my example? How can I make function "A" work in my example?
Beyond these two questions, an implementation of Thomas's topic-model workflow would obviously be pretty nice for the topic modeling community; that's what we are working towards. If you can tackle that head on, even better.