
I have a dataframe with a column of text and a column of keywords.

>>> df.head(3)
+-------+-----------------------------------------+---------------------------------------+    
| Index |                  Text                   |               Keywords                |    
+-------+-----------------------------------------+---------------------------------------+    
|     1 | "Here is some text"                     | ["here","text"]                       |     
|     2 | "Some red birds and blue elephants"     | ["red", "bird", "blue", "elephant"]   |    
|     3 | "Please help me with my pandas problem" | ["help", "pandas", "problem"]         |    
+-------+-----------------------------------------+---------------------------------------+    

I use itertools.combinations to build a dataframe with every possible pair of keywords.

>>> edge_df.head(3)
+-------+--------+--------+    
| Index |  Src   |  Dst   |    
+-------+--------+--------+    
|     1 | "here" | "text" |    
|     2 | "here" | "red"  |    
|     3 | "here" | "bird" |    
+-------+--------+--------+    

I then apply a function that goes through each keyword pair and assigns edge_df['weight'] the number of times that pair appears in the same piece of text (i.e. in the same keyword list).

>>> edge_df.head(3)
+-------+--------+--------+--------+    
| Index |  Src   |  Dst   | Weight |    
+-------+--------+--------+--------+    
|     1 | "here" | "text" |      1 |    
|     2 | "here" | "red"  |      3 |    
|     3 | "here" | "bird" |      8 |    
+-------+--------+--------+--------+    

My problem is that the code is currently very slow (about an hour for 300 rows of short texts). Below is the code I am using to build edge_df and apply the function. Is there anything I can do to speed this up?

import pandas as pd
from itertools import combinations
from tqdm import tqdm

tqdm.pandas()  # registers DataFrame.progress_apply

def indexes_by_word(word1, word2):
    """
    Find the matching texts between two words.
    """
    indx1 = set(df[df['Keywords'].apply(lambda lst: word1 in lst)].index)
    indx2 = set(df[df['Keywords'].apply(lambda lst: word2 in lst)].index)
    return len(indx1.intersection(indx2))

# Make list of all unique words
unique_words = df['Keywords'].apply(pd.Series).stack().reset_index(drop=True).unique()

# Make an empty edgelist dataframe of our words
edges = pd.DataFrame(data=list(combinations(unique_words, 2)),
                     columns=['src', 'dst'])

edges['weight'] = edges.progress_apply(lambda x: indexes_by_word(x['src'], x['dst']), axis=1)

edges.head()
  • Things like `apply(lambda..` and `apply(pd.Series)` are significantly slowing down your code. – Erfan May 11 '19 at 13:05
  • Am I correct in understanding you're looking at all possible pairs of words rather than just the pairs that actually occur? It's probably much faster if you look at the pairs that occur together and then do something rather than looking at each possible pair and then checking whether they appear together. – Joel May 12 '19 at 00:46
  • Right, that makes a lot of sense! I'll try to think of a way to do that. – Bertil Johannes Ipsen May 12 '19 at 08:23
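
A rough sketch of Joel's suggestion, counting only the pairs that actually co-occur (my own illustration, not the OP's code; pair_counts is a hypothetical name):

from collections import Counter
from itertools import combinations

import pandas as pd

df = pd.DataFrame({
    "Text": ["Here is some text",
             "Some red birds and blue elephants",
             "Please help me with my pandas problem"],
    "Keywords": [["here", "text"],
                 ["red", "bird", "blue", "elephant"],
                 ["help", "pandas", "problem"]],
})

# One pass over the rows: count each unordered pair of keywords that
# appears together in a row. set() guards against duplicate keywords
# within a row, and sorted() makes ("red", "bird") and ("bird", "red")
# the same key.
pair_counts = Counter(
    tuple(sorted(pair))
    for keywords in df["Keywords"]
    for pair in combinations(set(keywords), 2)
)

edges = pd.DataFrame(
    [(src, dst, w) for (src, dst), w in pair_counts.items()],
    columns=["src", "dst", "weight"],
)

Pairs that never co-occur simply never show up in pair_counts, so the work is a single pass over the rows instead of one scan of the dataframe per candidate pair.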

1 Answer


Only a 10% improvement from taking the apply out of indexes_by_word. Anyway, here is a harness to A/B test your code. I would appreciate seeing additional optimizations.

import pandas as pd
import numpy as np
from itertools import combinations
import timeit

df = pd.DataFrame([
    {"Text": "Here is some text", "Keywords": ["here", "text"]},
    {"Text": "Some red birds and blue elephants", "Keywords": ["red", "bird", "blue", "elephant"]},
    {"Text": "Please help me with my pandas problem", "Keywords": ["help", "pandas", "problem"]},
])

#https://stackoverflow.com/questions/12680754/split-explode-pandas-dataframe-string-entry-to-separate-rows/40449726#40449726
def explode(df, lst_cols, fill_value='', preserve_index=False):
    # make sure `lst_cols` is list-alike
    if (lst_cols is not None
        and len(lst_cols) > 0
        and not isinstance(lst_cols, (list, tuple, np.ndarray, pd.Series))):
        lst_cols = [lst_cols]
    # all columns except `lst_cols`
    idx_cols = df.columns.difference(lst_cols)
    # calculate lengths of lists
    lens = df[lst_cols[0]].str.len()
    # preserve original index values    
    idx = np.repeat(df.index.values, lens)
    # create "exploded" DF
    res = (pd.DataFrame({
                col:np.repeat(df[col].values, lens)
                for col in idx_cols},
                index=idx)
             .assign(**{col:np.concatenate(df.loc[lens>0, col].values)
                            for col in lst_cols}))
    # append those rows that have empty lists
    if (lens == 0).any():
        # at least one list in cells is empty
        res = (pd.concat([res, df.loc[lens == 0, idx_cols]], sort=False)
                  .fillna(fill_value))
    # revert the original index order
    res = res.sort_index()
    # reset index if requested
    if not preserve_index:        
        res = res.reset_index(drop=True)
    return res

keyword_index = explode(df,['Keywords'], preserve_index=True)['Keywords']
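
For reference, keyword_index is a Series with one entry per keyword, keyed by the original row index (output below is from the three-row df above):

>>> keyword_index
0        here
0        text
1         red
1        bird
1        blue
1    elephant
2        help
2      pandas
2     problem
Name: Keywords, dtype: object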

def first(df):
    def indexes_by_word_first(word1, word2):
        """
        Find the matching texts between two words.
        """
        indx1 = set(df[df['Keywords'].apply(lambda lst: word1 in lst)].index)
        indx2 = set(df[df['Keywords'].apply(lambda lst: word2 in lst)].index)
        return len(indx1.intersection(indx2))

    # Make list of all unique words
    unique_words = df['Keywords'].apply(pd.Series).stack().reset_index(drop=True).unique()

    # Make an empty edgelist dataframe of our words
    edges = pd.DataFrame(data=list(combinations(unique_words, 2)),
                         columns=['src', 'dst'])

    edges['weight'] = edges.apply(lambda x: indexes_by_word_first(x['src'], x['dst']), axis=1)

    return edges

def second(df):
    def indexes_by_word_second(word1, word2):
        """
        Find the matching texts between two words.
        """
        indx1 = set(keyword_index[keyword_index == word1].index.values)
        indx2 = set(keyword_index[keyword_index == word2].index.values)
        return len(indx1.intersection(indx2))

    # Make list of all unique words
    unique_words = df['Keywords'].apply(pd.Series).stack().reset_index(drop=True).unique()

    # Make an empty edgelist dataframe of our words
    edges = pd.DataFrame(data=list(combinations(unique_words, 2)),
                         columns=['src', 'dst'])

    edges['weight'] = edges.apply(lambda x: indexes_by_word_second(x['src'], x['dst']), axis=1)

    return edges

if __name__ == '__main__':
    assert(first(df).equals(second(df)))
    print("first ",timeit.timeit("first(df)", setup="from __main__ import first, df", number=50))
    print("second ",timeit.timeit("second(df)", setup="from __main__ import second, df", number=50))

Produces

first  1.8623420829999997
second  1.7135651139999997
– Rich Andrews
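
In the spirit of the request for additional optimizations, here is one more sketch (mine, and not part of the benchmark above): explode the keywords once and self-merge on the row index, so pandas counts all co-occurring pairs in a single groupby instead of rescanning the frame for each pair. Unlike first and second, it only emits pairs that actually co-occur, so it is not a drop-in equals() replacement:

def third(df):
    # One row per (original row, keyword).
    exploded = (df['Keywords']
                .apply(pd.Series)
                .stack()
                .reset_index(level=0))
    exploded.columns = ['row', 'word']

    # Self-merge on the row index yields every ordered keyword pair per
    # row; keeping src < dst counts each unordered pair exactly once.
    pairs = exploded.merge(exploded, on='row', suffixes=('_src', '_dst'))
    pairs = pairs[pairs['word_src'] < pairs['word_dst']]

    return (pairs.groupby(['word_src', 'word_dst'])
                 .size()
                 .reset_index(name='weight')
                 .rename(columns={'word_src': 'src', 'word_dst': 'dst'}))

On larger inputs the cost of the self-merge grows with the number of co-occurrences per row, not with the number of candidate pairs across the whole vocabulary.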