12

Imagine I have a dictionary / hashtable of pairs of strings (keys) and their respective probabilities (values):

import numpy as np
import random
import uuid

# Creating the N vocabulary and M vocabulary
max_word_len = 20
n_vocab_size = random.randint(8000,10000)
m_vocab_size = random.randint(8000,10000)

def random_word(max_len=None):
    """Return a random uppercase hex string of 1..max_len characters.

    Uses uuid4 as a cheap source of random hex digits.  ``max_len``
    defaults to the module-level ``max_word_len`` when not given.

    Note: the original called ``uuid4().get_hex()``, which exists only in
    Python 2 (removed in Python 3); the ``.hex`` attribute is equivalent
    and works on both.
    """
    if max_len is None:
        max_len = max_word_len
    return str(uuid.uuid4().hex.upper()[0:random.randint(1, max_len)])

# Generate some random words.
n_vocab = [random_word() for i in range(n_vocab_size)]
m_vocab = [random_word() for i in range(m_vocab_size)]


# Let's hallucinate probabilities for each word pair.
hashes =  {(n, m): random.random() for n in n_vocab for m in m_vocab}

The hashes hashtable will look something like this:

{('585F', 'B4867'): 0.7582038699473549,
 ('69', 'D98B23C5809A'): 0.7341569569849136,
 ('4D30CB2BF4134', '82ED5FA3A00E4728AC'): 0.9106077161619021,
 ('DD8F8AFA5CF', 'CB'): 0.4609114677237601,
...
}

Imagine that this is the input hashtable that I'll read from CSV file with the first and second column being the word pairs (keys) of the hashtable and the third column the probabilities

If I were to put the probabilities into some sort of numpy matrix, I would have to do this from the hashtable:

 n_words, m_words = zip(*hashes.keys())
 probs = np.array([[hashes[(n, m)] for n in n_vocab] for m in m_vocab])

Is there another way to get the prob into the |N| * |M| matrix from the hashtable without doing a nested loop through the m_vocab and n_vocab?

(Note: I'm creating random words and random probabilities here but imagine I have read the hash table from a file and it's read into that hashtable structure)


Assume both scenarios, where:

  1. The hashtable is from a csv file (@bunji's answer resolves this)
  2. The hashtable comes from a pickled dictionary. Or that the hashtable was computed some other way before reaching the part where converting it into a matrix is necessary.

It is important that the final matrix needs to be queryable, the following isn't desirable:

$ echo -e 'abc\txyz\t0.9\nefg\txyz\t0.3\nlmn\topq\t\0.23\nabc\tjkl\t0.5\n' > test.txt

$ cat test.txt
abc xyz 0.9
efg xyz 0.3
lmn opq .23
abc jkl 0.5


$ python
Python 2.7.10 (default, Jul 30 2016, 18:31:42) 
[GCC 4.2.1 Compatible Apple LLVM 8.0.0 (clang-800.0.34)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
>>> import pandas as pd
>>> pt = pd.read_csv('test.txt', index_col=[0,1], header=None, delimiter='\t').unstack().as_matrix()
>>> pt
array([[ 0.5,  nan,  0.9],
       [ nan,  nan,  0.3],
       [ nan,  nan,  nan]])
>>> pd.read_csv('test.txt', index_col=[0,1], header=None, delimiter='\t').unstack()
       2         
1    jkl opq  xyz
0                
abc  0.5 NaN  0.9
efg  NaN NaN  0.3
lmn  NaN NaN  NaN

>>> df = pd.read_csv('test.txt', index_col=[0,1], header=None, delimiter='\t').unstack()

>>> df
       2         
1    jkl opq  xyz
0                
abc  0.5 NaN  0.9
efg  NaN NaN  0.3
lmn  NaN NaN  NaN

>>> df['abc', 'jkl']
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/Library/Python/2.7/site-packages/pandas/core/frame.py", line 2055, in __getitem__
    return self._getitem_multilevel(key)
  File "/Library/Python/2.7/site-packages/pandas/core/frame.py", line 2099, in _getitem_multilevel
    loc = self.columns.get_loc(key)
  File "/Library/Python/2.7/site-packages/pandas/indexes/multi.py", line 1617, in get_loc
    return self._engine.get_loc(key)
  File "pandas/index.pyx", line 139, in pandas.index.IndexEngine.get_loc (pandas/index.c:4160)
  File "pandas/index.pyx", line 161, in pandas.index.IndexEngine.get_loc (pandas/index.c:4024)
  File "pandas/src/hashtable_class_helper.pxi", line 732, in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13161)
  File "pandas/src/hashtable_class_helper.pxi", line 740, in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13115)
KeyError: ('abc', 'jkl')
>>> df['abc']['jkl']
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/Library/Python/2.7/site-packages/pandas/core/frame.py", line 2055, in __getitem__
    return self._getitem_multilevel(key)
  File "/Library/Python/2.7/site-packages/pandas/core/frame.py", line 2099, in _getitem_multilevel
    loc = self.columns.get_loc(key)
  File "/Library/Python/2.7/site-packages/pandas/indexes/multi.py", line 1597, in get_loc
    loc = self._get_level_indexer(key, level=0)
  File "/Library/Python/2.7/site-packages/pandas/indexes/multi.py", line 1859, in _get_level_indexer
    loc = level_index.get_loc(key)
  File "/Library/Python/2.7/site-packages/pandas/indexes/base.py", line 2106, in get_loc
    return self._engine.get_loc(self._maybe_cast_indexer(key))
  File "pandas/index.pyx", line 139, in pandas.index.IndexEngine.get_loc (pandas/index.c:4160)
  File "pandas/index.pyx", line 163, in pandas.index.IndexEngine.get_loc (pandas/index.c:4090)
KeyError: 'abc'

>>> df[0][2]
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/Library/Python/2.7/site-packages/pandas/core/frame.py", line 2055, in __getitem__
    return self._getitem_multilevel(key)
  File "/Library/Python/2.7/site-packages/pandas/core/frame.py", line 2099, in _getitem_multilevel
    loc = self.columns.get_loc(key)
  File "/Library/Python/2.7/site-packages/pandas/indexes/multi.py", line 1597, in get_loc
    loc = self._get_level_indexer(key, level=0)
  File "/Library/Python/2.7/site-packages/pandas/indexes/multi.py", line 1859, in _get_level_indexer
    loc = level_index.get_loc(key)
  File "/Library/Python/2.7/site-packages/pandas/indexes/base.py", line 2106, in get_loc
    return self._engine.get_loc(self._maybe_cast_indexer(key))
  File "pandas/index.pyx", line 139, in pandas.index.IndexEngine.get_loc (pandas/index.c:4160)
  File "pandas/index.pyx", line 161, in pandas.index.IndexEngine.get_loc (pandas/index.c:4024)
  File "pandas/src/hashtable_class_helper.pxi", line 404, in pandas.hashtable.Int64HashTable.get_item (pandas/hashtable.c:8141)
  File "pandas/src/hashtable_class_helper.pxi", line 410, in pandas.hashtable.Int64HashTable.get_item (pandas/hashtable.c:8085)
KeyError: 0

>>> df[0]
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/Library/Python/2.7/site-packages/pandas/core/frame.py", line 2055, in __getitem__
    return self._getitem_multilevel(key)
  File "/Library/Python/2.7/site-packages/pandas/core/frame.py", line 2099, in _getitem_multilevel
    loc = self.columns.get_loc(key)
  File "/Library/Python/2.7/site-packages/pandas/indexes/multi.py", line 1597, in get_loc
    loc = self._get_level_indexer(key, level=0)
  File "/Library/Python/2.7/site-packages/pandas/indexes/multi.py", line 1859, in _get_level_indexer
    loc = level_index.get_loc(key)
  File "/Library/Python/2.7/site-packages/pandas/indexes/base.py", line 2106, in get_loc
    return self._engine.get_loc(self._maybe_cast_indexer(key))
  File "pandas/index.pyx", line 139, in pandas.index.IndexEngine.get_loc (pandas/index.c:4160)
  File "pandas/index.pyx", line 161, in pandas.index.IndexEngine.get_loc (pandas/index.c:4024)
  File "pandas/src/hashtable_class_helper.pxi", line 404, in pandas.hashtable.Int64HashTable.get_item (pandas/hashtable.c:8141)
  File "pandas/src/hashtable_class_helper.pxi", line 410, in pandas.hashtable.Int64HashTable.get_item (pandas/hashtable.c:8085)
KeyError: 0

The resulting matrix/dataframe should be queryable, i.e. is able to do something like:

probs[('585F', 'B4867')] = 0.7582038699473549
alvas
  • 115,346
  • 109
  • 446
  • 738
  • Could you use pandas for this? Creating a dataframe for a dictionary? Using both keys as two columns and the hash as another. You may be able to create a composite index after that. Just a guess. – Tammo Heeren Oct 24 '16 at 01:47
  • And then `pandas.DataFrame.tonumpy()`? Is there such a function? Let me try. – alvas Oct 24 '16 at 01:50
  • 1
    Your `uuid4().get_hex().upper()` might need to be changed to `uuid4().hex.upper()` for python 3.5 or so. – Tammo Heeren Oct 24 '16 at 02:11
  • Why do you need to have them in a table like this? – Tammo Heeren Oct 24 '16 at 02:19
  • I actually have another |M| x 1 vector that corresponds to M, that needs to go through matrix multiplication to form |N| x |M| * |M| x 1 = |N| x 1 . Also, the matrix would be necessary to perform other statistical calculations, in `nlp`, the matrix would be called a co-occurence matrix. – alvas Oct 24 '16 at 02:41
  • I played with this a little, but I can't get this done without the looping either. – Tammo Heeren Oct 24 '16 at 05:33

5 Answers5

5

I'm not sure if there is a way to completely avoid looping but I imagine it could be optimized by using itertools:

import itertools
nested_loop_iter = itertools.product(n_vocab,m_vocab)
#note that because it iterates over n_vocab first we will need to transpose it at the end
probs = np.fromiter(map(hashes.get, nested_loop_iter),dtype=float)
probs.resize((len(n_vocab),len(m_vocab)))
probs = probs.T
Tadhg McDonald-Jensen
  • 20,699
  • 5
  • 35
  • 59
4

If your end goal is to read in your data from a .csv file, it might be easier to read the file directly using pandas.

import pandas as pd

df = pd.read_csv('coocurence_data.csv', index_col=[0,1], header=None).unstack()
probs = df.as_matrix()

this reads your data from the csv, makes the first two columns into a multi-index which corresponds to your two sets of words. It then unstacks the multi-index so that you have one set of words as the column labels and another as the index labels. This gives you your |N|*|M| matrix which can then be converted into a numpy array with the .as_matrix() function.

This doesn't really resolve your question about changing your {(n,m):prob} dictionary into a numpy array but given your intentions, this will allow you to avoid the need to create that dictionary altogether.

Also, if you're going to be reading in the csv anyway, reading it using pandas in the first place is going to be faster than using the builtin csv module anyway: see these benchmark tests here

EDIT

In order to query a specific value in your DataFrame based on the row and column labels, use df.loc:

df.loc['xyz', 'abc']

where 'xyz' is your word in your row label and 'abc' is your column label. Also check out df.ix and df.iloc for other ways of querying specific cells in your DataFrame.

Community
  • 1
  • 1
bunji
  • 5,063
  • 1
  • 17
  • 36
  • It's true that if I read the file into a panda dataframe, then only the diagonal vector will be populated but if the hashtable is pickled as a `dict`, it might not be as convenient. Still this a good way to read the csv =) – alvas Oct 30 '16 at 23:09
  • Ah that's tricky, the headers are not defined in the final probs matrix, making it un-queryable =( – alvas Oct 31 '16 at 03:18
  • @alvas you can query a cell in a DataFrame based on labels using the [`.loc`](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.loc.html) function. I will edit my answer above to demonstrate this. – bunji Oct 31 '16 at 16:01
  • 1
    `.values` is recommended over `.as_matrix()` – Paul H Oct 31 '16 at 16:13
  • Kind of strange: when I tried it on my dataset, it keeps saying my words are not in the index when I used `df.loc`. – alvas Nov 21 '16 at 06:01
  • Seems like some python limitations is causing the indexerror issue http://stackoverflow.com/questions/40714519/indexerror-on-huge-list-in-python – alvas Nov 21 '16 at 07:04
3

[a short extension of the answer of dr-xorile]

Most solutions look good to me. It depends a little on whether you need speed or convenience.

I agree that you have basically a matrix in coo sparse format. You might want to look at https://docs.scipy.org/doc/scipy-0.18.1/reference/sparse.html

The only problem is that matrices need integer indices. So as long as your hashes are small enough to be quickly expressed as an np.int64, that should work. And the sparse format should allow $O(1)$ access to all elements.

(Sorry for the brevity!)

rough outline

This could potentially be fast but is kind of hacky.

  1. get the data in sparse representation. I think you should pick coo_matrix to just hold your 2D hash map.

    a. load the CSV using numpy.genfromtxt and use e.g. datatype ['>u8', '>u8', np.float32] to treat the hashes as string representations of unsigned 8-byte integer numbers. If that does not work you might load strings and use numpy to convert it. Finally you have three tables of size N * M like your hash table and use these with the scipy sparse matrix representation of your choice.

    b. if you have the object already in memory you might be able to use the sparse constructor directly

  2. To access you need to parse your strings again

    prob = matrix[np.fromstring(key1, dtype='>u8'), np.fromstring(key2, dtype='>u8')]
    
jhp
  • 519
  • 3
  • 13
2

It seems a bit inefficient to go through the entire n_vocab x m_vocab space for a sparse matrix! You could loop over the original hashes table. It would be good to know a couple of things first, of course:

  1. Do you know the size of n_vocab and m_vocab upfront? Or are you going to figure that out as you build it?

  2. Do you know if there are any repetitions in your hash table, and if so how will you handle it? It looks like `hashes` is a dictionary, in which case, obviously the keys are unique. In practice, that probably means you're over-writing each time, and so the last value is what will stand.

In any event, here's a comparison of the two options:

from collections import defaultdict
import numpy as np

# Small fixed stand-in for the real hash table.  defaultdict(float)
# returns 0.0 for any absent (n, m) pair, so the dense probs1 matrix
# below can be built even though only four pairs are stored.
hashes = defaultdict(float,{('585F', 'B4867'): 0.7582038699473549,
 ('69', 'D98B23C5809A'): 0.7341569569849136,
 ('4D30CB2BF4134', '82ED5FA3A00E4728AC'): 0.9106077161619021,
 ('DD8F8AFA5CF', 'CB'): 0.4609114677237601})

#Double loop approach
# NOTE(review): the vocabularies are rebuilt here from the keys, so each
# list holds one word per stored pair (4 entries), not full vocabularies.
n_vocab, m_vocab = zip(*hashes.keys())
# probs1 is indexed [m][n]: rows follow m_vocab, columns follow n_vocab.
probs1 = np.array([[hashes[(n, m)] for n in n_vocab] for m in m_vocab])

#Loop through the hash approach
n_hash = dict()  #Create a hash table to find the correct row number
for i,n in enumerate(n_vocab):
    n_hash[n] = i
m_hash = dict()  #Create a hash table to find the correct col number
for i,m in enumerate(m_vocab):
    m_hash[m] = i
# Dense zero matrix; only the pairs actually present get filled in.
probs2 = np.zeros((len(n_vocab),len(m_vocab)))
for (n,m) in hashes: #Loop through the hashes and put the values into the probs table
    probs2[n_hash[n],m_hash[m]] = hashes[(n,m)]
# NOTE(review): probs1 is filled at [m][n] but probs2 at [n][m] -- in
# general these are transposes of each other; they coincide here only
# because every nonzero lands on the diagonal (n_hash[n] == m_hash[m]
# for each stored pair).

The output of probs1 and probs2 is, of course, the same:

>>> probs1
array([[ 0.73415696,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.46091147,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.75820387,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.91060772]])
>>> probs2
array([[ 0.73415696,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.46091147,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.75820387,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.91060772]])

And, of course, your code for probs1 is very terse. However, the size of the loops is substantially different, and it could make a big difference to the run time

Dr Xorile
  • 967
  • 1
  • 7
  • 20
2

I tried to reduce sample size to quickly compare different codes. I coded dataframe method, which might still use for loop in pandas function, and compared to the original code and itertools code provided by Tadhg McDonald-Jensen. The fastest code is itertools.

In [3]: %timeit itertool(hashes,n_vocab,m_vocab)
1000 loops, best of 3: 1.12 ms per loop

In [4]: %timeit baseline(hashes,n_vocab,m_vocab)
100 loops, best of 3: 3.23 ms per loop

In [5]: %timeit dataframeMethod(hashes,n_vocab,m_vocab)
100 loops, best of 3: 5.49 ms per loop

This is the code I used to compare.

import numpy as np
import random
import uuid
import pandas as pd
import itertools

# Creating the N vocabulary and M vocabulary
max_word_len = 20
n_vocab_size = random.randint(80,100)
m_vocab_size = random.randint(80,100)

def random_word(max_len=None):
    """Return a random uppercase hex string of 1..max_len characters.

    Uses uuid4 as a cheap source of random hex digits.  ``max_len``
    defaults to the module-level ``max_word_len`` when not given.

    Note: the original called ``uuid4().get_hex()``, which exists only in
    Python 2 (removed in Python 3); the ``.hex`` attribute is equivalent
    and works on both.
    """
    if max_len is None:
        max_len = max_word_len
    return str(uuid.uuid4().hex.upper()[0:random.randint(1, max_len)])

# Generate some random words.
n_vocab = [random_word() for i in range(n_vocab_size)]
m_vocab = [random_word() for i in range(m_vocab_size)]


# Let's hallucinate probabilities for each word pair.
hashes =  {(n, m): random.random() for n in n_vocab for m in m_vocab}

def baseline(hashes, n_vocab, m_vocab):
    """Build the |M| x |N| probability matrix with a nested comprehension.

    Row i corresponds to m_vocab[i] and column j to n_vocab[j], i.e. the
    entry at [i, j] is hashes[(n_vocab[j], m_vocab[i])].
    """
    # The original also unpacked hashes.keys() into n_words/m_words, but
    # those values were never used, so that extra O(N*M) pass is dropped.
    return np.array([[hashes[(n, m)] for n in n_vocab] for m in m_vocab])

def itertool(hashes, n_vocab, m_vocab):
    """Fill the probability matrix via a single flat itertools.product pass.

    Returns a |M| x |N| array whose rows follow m_vocab and whose columns
    follow n_vocab, matching the nested-comprehension baseline.
    """
    n_count = len(n_vocab)
    m_count = len(m_vocab)
    pair_stream = itertools.product(n_vocab, m_vocab)
    # product iterates n_vocab in the outer position, so the flat buffer
    # is laid out n-major; shape it as (|N|, |M|) and transpose.
    flat = np.fromiter(map(hashes.get, pair_stream), dtype=float,
                       count=n_count * m_count)
    return flat.reshape(n_count, m_count).T

def dataframeMethod(hashes, n_vocab, m_vocab):
    """Pivot the {(n, m): prob} dict into a |M| x |N| matrix via pandas."""
    # A Series keyed by the (n, m) tuples gets a two-level MultiIndex.
    series = pd.Series(list(hashes.values()),
                       index=pd.MultiIndex.from_tuples(hashes.keys()))
    # Unstacking level 0 moves the n-words to the columns, leaving the
    # m-words as the row index.
    table = series.unstack(level=0)
    # Reorder rows/columns to match the given vocab lists before
    # dropping down to a plain ndarray.
    return table.loc[m_vocab, n_vocab].values