While looking for an optimized solution, I ended up with a modified version of the approach in tdy's answer and the one here: Combine pandas string columns with missing values.
I turned this code into a function because I needed to use it repeatedly.
Hope this helps someone else who comes across the same problem:
# functions
def concat_df_cols(df, source_cols, target_col, sep=" ", na_rep=""):
    """Concatenate source_cols into target_col, handling NaN and mixed dtypes.

    Args:
        df (pd.DataFrame): The dataframe to be modified.
        source_cols (list): The columns to concatenate.
        target_col (str): The destination column for the concatenated source columns.
        sep (str): The separator with which to concatenate the columns.
        na_rep (str): The replacement value for NaN values. Note: anything
            other than the default empty string will persist in the output
            after the concatenation.

    Returns:
        pd.DataFrame: A copy of the dataframe with target_col added.
    """
    df = df.copy()  # bug fix: do not mutate the caller's frame
    # Convert ONLY the source columns to str, keeping missing values as NaN so
    # dropna() below can skip them. (The previous version replaced NaN across
    # the whole frame and undid it with replace(..., limit=1), which clobbered
    # unrelated columns and relied on a deprecated pandas argument.)
    replacement = np.nan if na_rep == "" else na_rep
    strs = df[source_cols].astype(str).where(df[source_cols].notnull(), replacement)
    # Row-wise join of the non-missing values; all-missing rows become None.
    df[target_col] = strs.apply(
        lambda row: None if row.isnull().all() else sep.join(row.dropna()), axis=1)
    return df
def concat_df_cols_fast(df, sep=" ", na_rep=""):
    """Concatenate all columns of ``df`` row-wise into a single Series.

    Args:
        df (pd.DataFrame): Frame containing ONLY the columns to concatenate.
        sep (str): The separator with which to concatenate the columns.
        na_rep (str): The replacement value for NaN values. Note: anything
            other than the default empty string will persist in the output
            after the concatenation.

    Returns:
        pd.Series: One concatenated string per row (None where every source
        value was missing), aligned to ``df``'s index.
    """
    # Bug fix: fill NaN *before* converting to str. The old order ran
    # applymap(str) first, so str(np.nan) produced the literal "nan", which
    # leaked into the output and made the later fillna a no-op. astype(str)
    # also replaces applymap, which is deprecated in pandas >= 2.1.
    strs = df.fillna(na_rep).astype(str)
    rows = strs.values.tolist()
    # Join each row, trim stray separators left by empty cells, and turn
    # fully-empty results back into NaN. index=df.index keeps the result
    # aligned when assigned back onto a frame with a non-default index.
    s = pd.Series([sep.join(row).strip(sep) for row in rows], index=df.index)
    s = s.replace('^$', np.nan, regex=True)
    # Replace NaN with None for downstream consumers.
    return s.where(s.notnull(), None)
# setup: build the 5-row sample and tile it to 50,000 rows in one step
base_rows = [
    ['day', np.nan, np.nan, np.nan],
    ['day', np.nan, 'tablet(s)', np.nan],
    ['day', 2, np.nan, np.nan],
    ['day', 1, 'tablet(s)', np.nan],
    ['day', 2, 'tablet(s)', np.nan],
]
columns = ['dosagedurationunit', 'dosagequantityvalue', 'dosagequantityunit', 'quantityvalue']
df = pd.DataFrame(base_rows * 10000, columns=columns)
##### Approach 1 #####
# This approach took on average 0.27553908449 seconds
# Pass only the two source columns; the function returns a Series that is
# assigned back as the new NORMALIZED_DOSAGE column.
df['NORMALIZED_DOSAGE'] = concat_df_cols_fast(df[['dosagequantityvalue', 'dosagequantityunit']], )
##### Approach 2 #####
# This approach took on average 5.92792463605 seconds
# replace nans with '' (note: affects EVERY column of the frame, not just the two sources)
df = df.replace(np.nan, '')
# concat value + unit
df['NORMALIZED_DOSAGE'] = df.dosagequantityvalue.astype(str) + ' ' + df.dosagequantityunit.astype(str)
# put nans back wherever a cell is empty/whitespace-only
# NOTE(review): `limit=1` alongside a replacement value is deprecated in newer
# pandas (it was intended for fill methods) — confirm it is intentional here.
df = df.replace(r'^\s*$', np.nan, limit=1, regex=True)
##### Approach 3 #####
# This approach took on average 27.7539046249 seconds
# Slowest of the three: concat_df_cols does a Python-level row-wise apply.
df = concat_df_cols(df, source_cols=['dosagequantityvalue', 'dosagequantityunit'],
                    target_col='NORMALIZED_DOSAGE')
UPDATE: Refactored functions:
def concat_df_cols_new(df, sep=" ", na_rep=""):
    """Concatenate all columns of ``df`` row-wise into a single Series.

    Args:
        df (pd.DataFrame): Frame containing ONLY the columns to concatenate.
        sep (str): The separator with which to concatenate the columns.
        na_rep (str): The replacement value for NaN values. Note: anything
            other than the default empty string will persist in the output
            after the concatenation.

    Returns:
        pd.Series: One concatenated string per row (NaN where every source
        value was missing), aligned to ``df``'s index.
    """
    # Bug fix: na_rep was previously ignored (NaN was replaced with `sep`
    # and relied on the later strip to remove it). Fill with na_rep, then
    # convert to str; astype(str) replaces the deprecated applymap.
    converted = df.fillna(na_rep).astype(str)
    rows = converted.values.tolist()
    # Join each row, trim stray separators from empty cells, and turn
    # fully-empty results back into NaN. Preserve the caller's index so the
    # result assigns back correctly onto frames with a non-default index.
    out = pd.Series([sep.join(row).strip(sep) for row in rows], index=df.index)
    return out.replace('^$', np.nan, regex=True)
def replace_concat_replace_new(df):
    """Concatenate the dosage value and unit columns into one Series.

    Args:
        df (pd.DataFrame): Frame containing the ``dosagequantityvalue`` and
            ``dosagequantityunit`` columns (column names are hard-coded).

    Returns:
        pd.Series: "value unit" per row, NaN where both inputs were missing.
    """
    df = df.replace(np.nan, '')
    s = df.dosagequantityvalue.astype(str) + ' ' + df.dosagequantityunit.astype(str)
    # A whitespace-only result means both inputs were missing -> NaN
    s = s.replace(r'^\s*$', np.nan, regex=True)
    # Single vectorized strip instead of two separate regex replace passes
    # (str.strip leaves NaN untouched, so the line above is preserved).
    return s.str.strip()
# Refactored approach 1: column-agnostic helper, pass only the source columns
df['NORMALIZED_DOSAGE_CONCAT'] = concat_df_cols_new(df[['dosagequantityvalue', 'dosagequantityunit']])
# 131.98 ms ± 2.72 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# Refactored approach 2: column names hard-coded inside the function
df['NORMALIZED_DOSAGE'] = replace_concat_replace_new(df[['dosagequantityvalue', 'dosagequantityunit']])
# 395.97 ms ± 28.56 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
Ultimately, I'll go with concat_df_cols_new, simply because I can use that function on dataframes with different column names and its runtime is currently ~3x better — unless there's a solution that handles those cases too.