0

Ultimately, I want each inner list of the nested lists in column names_text to become its own row in the dataframe.

Code:

# Accumulates one row per processed text in `data`, then builds `df` from it.
# NOTE(review): the lines containing only '.' are elisions in this excerpt; the
# enclosing loop header and the origin of `raw_text` are not shown here.
data = []
        .
        .
        .
        # Join every element of raw_text (converted with str) into one
        # space-separated string.
        text = ' '.join(map(str, raw_text))
        
        ### create list of names
        m = re.search(r'long_regex', text, re.I | re.S)
        if m:
            # Keep parenthesised groups as they are (\1) and delete every run of
            # optional whitespace followed by digits.
            mps0 = re.sub(r'(\([^()]*\))|\s*\d+', r'\1', m.group(1))
            # Split on ") " or on three consecutive spaces; the delimiter is
            # consumed, so a closing parenthesis before a space is dropped.
            # NOTE(review): '\)' in a non-raw string literal is an invalid escape
            # sequence; a raw string (r'...') is presumably intended.
            mps1 = re.split('\) |   ', mps0)
            # NOTE(review): mps1 is a list of strings, so the inner loop walks
            # the characters of each string and flat_list ends up holding single
            # characters rather than whole name fragments -- confirm intent.
            flat_list = [item for sublist in mps1 for item in sublist]         
            flat_list[:] = [x for x in flat_list if x] # remove empty strings
            # NOTE(review): `names` is only assigned inside `if m:`, yet it is
            # read unconditionally further down; when the search fails it is
            # either undefined or left over from an earlier pass.
            names = []    
            for n in flat_list:
                names.extend(get_names_method(n))
        
        ### split text into c_list
        text1 = re.compile(r'very_long_regex')
        apply2 = text1.search(text)
        # c_list is a list holding at most one entry: the captured text or None.
        c_list = []
        if apply2:
            c_list.append(apply2.group(5))
        # search() returns either a match object or None, so this branch is
        # simply the "first pattern did not match" case.
        elif apply2 is None:
            text4 = re.compile(r'very_long_regex')
            # NOTE(review): `apply4` is read here before it has been assigned;
            # presumably `text4.search(text)` was meant.
            apply4 = apply4.search(text4)
            # NOTE(review): an `elif` cannot follow a plain assignment at this
            # indentation (SyntaxError); presumably this should be `if apply4:`.
            elif apply4:
                c_list.append(apply4.group(4))
            else:
                c_list.append(None)
        
        ### split c_list by names
        # One alternation built from the per-name patterns.
        regex_string = "|".join(regex_string_method(name) for name in names)
        # Number of "(" in the alternation plus one; presumably the extra one
        # accounts for the outer capture group added in the split pattern below.
        group_count = regex_string.count("(") + 1
        # NOTE(review): c_list is a list at this point, but re.split expects a
        # string as its second argument, so this call raises TypeError as written.
        fragments = re.split(f"({regex_string})", c_list)
        if fragments:
            # Drop the leading fragment when it is not one of the names.
            if not fragments[0] in names: 
                fragments = fragments[1:]
            # NOTE(review): c_list was assigned a list above, so this test is
            # always true.
            if c_list is not None:
                # Pair each name fragment with the fragment group_count
                # positions after it, stripping trailing whitespace and skipping
                # None entries. Inside the comprehension `c_list` is the loop
                # variable and shadows the outer list of the same name.
                result = [[name, c_list.rstrip()] for name, c_list in zip(
                    fragments[::group_count+1],
                    fragments[group_count::group_count+1]
                ) if c_list is not None]

    # A single row is appended per text, with the whole list of [name, text]
    # pairs stored in the last field; that is why names_text holds nested lists.
    # NOTE(review): `result` is only assigned inside the `if` blocks above, so it
    # can be undefined or stale here.
    data.append([text, names, c_list, result])
df = pd.DataFrame(data, columns =['col1', 'col2', 'col3', 'names_text'])

My df has a column names_text whose values are lists of lists. I want to transform the lists within names_text into rows, and I don't want to use .explode(). How do I need to change this for loop so that I don't have to explode the nested-lists column names_text?

Representative data:

# Representative input: every row is [col1, col2, col3, names_text], and
# names_text is itself a list of two-element lists.
d = [['aa',  None, 'xx', [['ps', 'ps1'], ['ps22', 'ps2'], ['ps33', 'ps3']]],
     [None, 'tt', 'jjjj', [['pppp', 'pppp1'], ['pppp22', 'pppp2']]],
     [None, 'uu', None, [['oo', 'oo1'], ['oo', 'oo2'], ['oo45', 'oo2'], ['oo4', 'oo3']]]]  # closing bracket of the outer list was missing

c = ['col1','col2','col3','names_text']
df = pd.DataFrame(d,columns=c)

print(df)

   col1  col2  col3                                       names_text
0    aa  None    xx            [[ps, ps1], [ps22, ps2], [ps33, ps3]]
1  None    tt  jjjj                 [[pppp, pppp1], [pppp22, pppp2]]
2  None    uu  None  [[oo, oo1], [oo, oo2], [oo45, oo2], [oo4, oo3]]

desired output:

# Desired result: one row per inner [name, text] pair, with col1..col3
# repeated for every pair that came from the same original row.
c = ["col1", "col2", "col3", "names_text"]

d = [
    # pairs from the first original row
    ["aa", None, "xx", ["ps", "ps1"]],
    ["aa", None, "xx", ["ps22", "ps2"]],
    ["aa", None, "xx", ["ps33", "ps3"]],
    # pairs from the second original row
    [None, "tt", "jjjj", ["pppp", "pppp1"]],
    [None, "tt", "jjjj", ["pppp22", "pppp2"]],
    # pairs from the third original row
    [None, "uu", None, ["oo", "oo1"]],
    [None, "uu", None, ["oo", "oo2"]],
    [None, "uu", None, ["oo45", "oo2"]],
    [None, "uu", None, ["oo4", "oo3"]],
]

df = pd.DataFrame(data=d, columns=c)

print(df)

   col1  col2  col3       names_text
0    aa  None    xx        [ps, ps1]
1    aa  None    xx      [ps22, ps2]
2    aa  None    xx      [ps33, ps3]
3  None    tt  jjjj    [pppp, pppp1]
4  None    tt  jjjj  [pppp22, pppp2]
5  None    uu  None        [oo, oo1]
6  None    uu  None        [oo, oo2]
7  None    uu  None      [oo45, oo2]
8  None    uu  None       [oo4, oo3]
id345678
  • 97
  • 1
  • 3
  • 21

0 Answers0