There is probably a third-party Python package that does exactly what you're looking for; I recommend searching for one before using my suggestion.
Anyway, the following code should work for most cases:
from __future__ import annotations
from typing import Iterable
import pandas as pd
import numpy as np
# Force pandas to print every column's full contents (no "..." truncation),
# so the split sentences in the "text" column are displayed in full.
pd.set_option('display.max_colwidth', None)
def process_review(
    review_text: str,
    split_len: int = 40,
    separators: list[str] | None = None,
) -> list[str]:
    """Split a review into lines of roughly `split_len` characters.

    Every separator is first normalised to a full stop ("."), the text is
    then split into sentences, and the sentences are greedily packed into
    lines.

    Parameters
    ----------
    review_text : str
        The review text.
    split_len : int, default=40
        The maximum length of each line.
    separators : list[str] | str | None, optional
        The separators marking sentence boundaries. A single string is
        treated as one separator. By default '.', ';', '?', '!', '...'
        and newline are used.

    Returns
    -------
    list[str]
        The list of lines. Always a list, even for reviews shorter than
        `split_len` (the previous version returned a bare `str` there,
        contradicting this contract and breaking list-consuming callers).

    Raises
    ------
    ValueError
        If `separators` is neither a string nor an iterable.
    """
    if separators is None:
        separators = ['.', ';', '?', '!', '...', '\n']
    elif isinstance(separators, str):
        separators = [separators]
    elif not isinstance(separators, Iterable):
        raise ValueError(f'`separators` {separators} not Iterable.')
    for sep in separators:
        review_text = review_text.replace(sep, '.')
    # Collapse runs such as "!?", "!!", "??" (now "..", "...") into a single
    # full stop. Remove this loop if you want to keep repeated punctuation.
    while '..' in review_text:
        review_text = review_text.replace('..', '.')
    review_text = review_text.strip()
    # Short reviews need no packing — still return a list for a consistent
    # return type.
    if len(review_text) <= split_len:
        return [review_text]
    # Re-attach the full stop to every non-empty sentence. Unlike the former
    # `split('.')[:-1]`, this keeps a trailing fragment that lacked a final
    # separator instead of silently dropping it.
    sentences = [s.strip() + '.' for s in review_text.split('.') if s.strip()]
    lines: list[str] = []
    line = ''
    for sentence in sentences:
        # Sentence can't fit on the current line => start a new one.
        # The `line and` guard avoids emitting an empty first line when the
        # very first sentence already exceeds `split_len`.
        if line and len(line) + len(sentence) + 1 >= split_len:
            lines.append(line)
            line = sentence
        else:
            # Fits => join with a space. (Fixes the old precedence bug
            # `line += ' ' if line != '' else '' + sentence`, which appended
            # only a space — dropping the sentence — once `line` was
            # non-empty.)
            line = f'{line} {sentence}' if line else sentence
    if line:
        lines.append(line)
    return lines
def extend_iloc(df: pd.DataFrame, col_target: str = 'text') -> pd.DataFrame:
    """Explode a column of lists into one row per list element.

    Each row of `df` is replicated once per element of its list in
    `col_target`; the replicated rows keep their original index labels and
    the list column is replaced by the individual elements (moved to the
    last column position).

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame whose `col_target` column holds lists.
    col_target : str, optional
        Name of the column of lists, by default 'text'.

    Returns
    -------
    pd.DataFrame
        DataFrame with replicated rows and a flattened `col_target` column.
    """
    # One flat sequence containing every element of every per-row list.
    flat_values = [elem for row_list in df[col_target] for elem in row_list]
    # Repeat each row's positional index once per element in its list.
    repeat_counts = df[col_target].apply(len)
    row_positions = np.repeat(np.arange(len(df)), repeat_counts)
    # Keep every column except the target (selected by position).
    keep_positions = [pos for pos, name in enumerate(df.columns) if name != col_target]
    expanded = df.iloc[row_positions, keep_positions].copy()
    expanded[col_target] = flat_values
    return expanded
# Build a small example DataFrame: one review per row.
original_reviews = pd.DataFrame(
    {'review id': ['33435', '33436'],
     'text': ['This is such an amazing product! I would defninitely buy it again '
              'and again. When first time I bought it, I decided that I will do it all my life.',
              'This is such an amazing product!!! I would defninitely buy it again '
              'and again. When first time I bought it, I decided that I will do it all my life.']})
# Split each 'text' value into a list of shorter sentences.
original_reviews['text'] = original_reviews['text'].apply(process_review)
# Note: You can control the maximum length of each sentence by setting a custom
# value for the `split_len` parameter. By default it's set to 40. For example:
# original_reviews['text'] = original_reviews['text'].apply(process_review, split_len=60)
# Turn each list element into its own row (the row's other columns are repeated).
print(extend_iloc(original_reviews))
# Prints:
#
#   review id                                                               text
# 0     33435                                   This is such an amazing product.
# 0     33435                        I would defninitely buy it again and again.
# 0     33435  When first time I bought it, I decided that I will do it all my life.
# 1     33436                                   This is such an amazing product.
# 1     33436                        I would defninitely buy it again and again.
# 1     33436  When first time I bought it, I decided that I will do it all my life.
I'm sure this implementation won't work for all situations — especially for long sentences without many breaks. Unfortunately, there are too many corner cases in a use case like the one you're trying to solve.