
I have to do some preprocessing on a set of .csv files. These .csv files are matrices of audio features from the TIMIT dataset; each one is a matrix of #samples x 123 features. I would like to run a sliding window over the samples.
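
To make the windowing concrete, here is a toy example (random numbers, not my real data) of the shape I am after: every window of timesteps consecutive frames should become one sample.

import numpy as np

frames = np.random.rand(10, 123)   # toy data: 10 frames x 123 features
timesteps = 5
windows = np.stack([frames[i:i + timesteps] for i in range(frames.shape[0] - timesteps + 1)])
print(windows.shape)               # (6, 5, 123): one sample per window position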

I wrote this class:

import glob
import pandas as pd
import numpy as np
from math import floor
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import time
import datetime
import progressbar

class MyDataGenerator:

    def __init__(self, path):
        self.__path = path
        self.__path_train = path[0]
        self.__path_test = path[1]
        self.__path_validation = path[2]

    def generate_overlapping_chunks(self, timesteps, compact = True):
        print("reading train:")
        data_train = self.generate_data_frame(self.__path_train)
        print("reading test:")
        data_test = self.generate_data_frame(self.__path_test)
        print("reading validation:")
        data_validation = self.generate_data_frame(self.__path_validation)
        if compact:
            data_train = self.compact_class(data_train)
            data_test = self.compact_class(data_test)
            data_validation = self.compact_class(data_validation)
        train_n, test_n, validation_n = self.min_max_scale_skl(data_train, data_test, data_validation)
        print("train:")
        train_data, train_label = self.generate_chunks(data_train, train_n, timesteps)
        print("test:")
        test_data, test_label = self.generate_chunks(data_test, test_n, timesteps)
        print("validation:")
        validation_data, validation_label = self.generate_chunks(data_validation, validation_n, timesteps)
        train_label, test_label, validation_label = self.encode_label(train_label, test_label, validation_label)
        return train_data, train_label, test_data, test_label, validation_data, validation_label

    def compact_class(self, data_file):
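        # fold the full TIMIT phone set into a reduced set:
        # closures, pauses and epenthetic silence all become 'sil',
        # and close allophones are merged into a single phone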
        data_file.loc[data_file['phoneme'] == 'ux', 'phoneme'] = 'uw'
        data_file.loc[data_file['phoneme'] == 'axr', 'phoneme'] = 'er'
        data_file.loc[data_file['phoneme'] == 'em', 'phoneme'] = 'm'
        data_file.loc[data_file['phoneme'] == 'nx', 'phoneme'] = 'n'
        data_file.loc[data_file['phoneme'] == 'eng', 'phoneme'] = 'ng'
        data_file.loc[data_file['phoneme'] == 'hv', 'phoneme'] = 'hh'
        data_file.loc[data_file['phoneme'] == 'h#', 'phoneme'] = 'sil'
        data_file.loc[data_file['phoneme'] == 'pau', 'phoneme'] = 'sil'
        data_file.loc[data_file['phoneme'] == 'pcl', 'phoneme'] = 'sil'
        data_file.loc[data_file['phoneme'] == 'tcl', 'phoneme'] = 'sil'
        data_file.loc[data_file['phoneme'] == 'kcl', 'phoneme'] = 'sil'
        data_file.loc[data_file['phoneme'] == 'bcl', 'phoneme'] = 'sil'
        data_file.loc[data_file['phoneme'] == 'dcl', 'phoneme'] = 'sil'
        data_file.loc[data_file['phoneme'] == 'gcl', 'phoneme'] = 'sil'
        data_file.loc[data_file['phoneme'] == 'epi', 'phoneme'] = 'sil'
        data_file.loc[data_file['phoneme'] == 'zh', 'phoneme'] = 'sh'
        data_file.loc[data_file['phoneme'] == 'en', 'phoneme'] = 'n'
        data_file.loc[data_file['phoneme'] == 'el', 'phoneme'] = 'l'
        data_file.loc[data_file['phoneme'] == 'ix', 'phoneme'] = 'ih'
        data_file.loc[data_file['phoneme'] == 'ax', 'phoneme'] = 'ah'
        data_file.loc[data_file['phoneme'] == 'ax-h', 'phoneme'] = 'ah'
        data_file.loc[data_file['phoneme'] == 'ao', 'phoneme'] = 'aa'
        return data_file

    def generate_data_frame(self, path):
        data = pd.DataFrame()
        tot = len(glob.glob(path))
        bar = progressbar.ProgressBar(maxval=tot, widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
        i = 0
        bar.start()
        for file_name in glob.iglob(path):
            data_file = pd.read_csv(file_name)
            data = pd.concat((data, data_file))
            i = i+1
            bar.update(i)
        bar.finish()
        data = data.rename(columns={'Unnamed: 0': 'frame'}) 
        return data

    def min_max_scale_skl(self, train, test, validation):
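        # fit a single scaler on the 123 feature columns of all three splits,
        # then rescale each split to the [-1, 1] range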
        scaler = MinMaxScaler(feature_range=(-1, 1))
        scaler = scaler.fit(np.concatenate((train.iloc[:, 1:124], test.iloc[:, 1:124], validation.iloc[:, 1:124])))
        return scaler.transform(train.iloc[:, 1:124]), scaler.transform(test.iloc[:, 1:124]), scaler.transform(validation.iloc[:, 1:124])

    def generate_chunks(self, data, data_norm, timesteps):
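        # slide a window of `timesteps` consecutive frames over the normalised data;
        # each window is labelled with the phoneme of its centre frame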
        label = np.empty(0)
        data_np = np.empty((1, timesteps, 123))
        b = range(timesteps, data.shape[0]+1) 
        bar = progressbar.ProgressBar(maxval=data.shape[0]-timesteps, widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
        bar.start()
        for i in range(0, data.shape[0]-timesteps+1):
            c = ((data_norm[i:b[i]])).reshape(1, timesteps, (124-1))
            data_np = np.concatenate((data_np, c))
            label = np.concatenate((label, [data.iloc[i+floor(timesteps/2)]['phoneme']]))
            bar.update(i)
        bar.finish()
        return data_np[1:], label

    def encode_label(self, train, test, val):
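        # fit the encoder on the labels of all splits so they share the same
        # class indices, then one-hot encode each split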
        encoder = LabelEncoder()
        encoder.fit(
            np.concatenate(
                (train, np.concatenate((test, val)))
                )
            )
        train_encoded_labels = encoder.transform(train)
        test_encoded_labels = encoder.transform(test)
        val_encoded_labels = encoder.transform(val)
        return to_categorical(train_encoded_labels), to_categorical(test_encoded_labels), to_categorical(val_encoded_labels)
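
For reference, this is roughly how I call it (the glob patterns and the timesteps value are placeholders, not my actual ones):

paths = ('train/*.csv', 'test/*.csv', 'validation/*.csv')   # placeholder glob patterns
generator = MyDataGenerator(paths)
train_x, train_y, test_x, test_y, val_x, val_y = generator.generate_overlapping_chunks(timesteps=11)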

I noticed that

generate_chunks(self, data, data_norm, timesteps)

is very slow. The last run took more than 40 hours on an Intel Xeon E5-1620 v3. I am using Python 3.6.8 installed with Anaconda. Any ideas for speeding this code up?

2 Answers


Try dividing your records into smaller chunks and then processing them in parallel. Here is a good discussion with simple examples: How to use threading in Python?

There is also the option of using Cython (roughly speaking, Python compiled to C), which can be helpful for big loops like this one.
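
As a minimal sketch of the parallel idea (using multiprocessing rather than threads; the worker function and the glob pattern are made up for illustration, and this only parallelises the file-reading step):

import glob
from multiprocessing import Pool

import pandas as pd

def process_file(file_name):
    # hypothetical per-file worker: read one .csv into a DataFrame
    return pd.read_csv(file_name)

if __name__ == '__main__':
    files = glob.glob('train/*.csv')      # placeholder pattern
    with Pool(processes=4) as pool:       # adjust to your core count
        frames = pool.map(process_file, files)
    data = pd.concat(frames, ignore_index=True)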

Rob Ert
data_np = np.concatenate((data_np, c))
label = np.concatenate((label, [data.iloc[i+floor(timesteps/2)]['phoneme']]))

These are expensive operations and you do them a lot.

def generate_chunks(self, data, data_norm, timesteps):
    label = []
    data_np = []
    b = range(timesteps, data.shape[0] + 1)

    for i in range(0, data.shape[0] - timesteps + 1):
        # collect the windows and labels in plain Python lists...
        data_np.append(data_norm[i:b[i]].reshape(1, timesteps, 124 - 1))
        label.append(data.iloc[i + floor(timesteps / 2)]['phoneme'])

    # ...and build the final arrays once, after the loop
    data_np = np.concatenate(data_np)
    label = np.array(label)

    return data_np, label

Something like this will be at least an order of magnitude faster with no change to memory usage. Other improvements will help as well (and you should consider profiling your code if you're interested in improving it), but this will be the big one.
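
If you do profile it, the standard library is enough; here `generator` is assumed to be an existing MyDataGenerator instance and 11 is just a placeholder timesteps value:

import cProfile
import pstats

# profile one call and print the 20 most expensive calls by cumulative time
profiler = cProfile.Profile()
profiler.enable()
generator.generate_overlapping_chunks(11)
profiler.disable()
pstats.Stats(profiler).sort_stats('cumulative').print_stats(20)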

CJR