Can someone please help me understand how the regex on line 34 works? It finds the maximum number of consecutive repeats of each STR (a sub-string) in a string s.
The specific line in the code:
str.update({k: max([len(x) // len(f'{k}') for x in re.findall(rf'((?:{k})+)', s)])})
I found the above syntax in the question "Count max consecutive RE groups in a string".
My complete code:
from sys import argv, exit
import csv
import collections
import re
def _longest_run(sequence, unit):
    """Return the largest number of back-to-back copies of *unit* in *sequence*.

    Every start position is considered, so runs of a unit that overlaps
    itself (e.g. "ABA" inside "ABABAABA") are counted correctly; a
    left-to-right, non-overlapping regex scan would miss them.
    Returns 0 when *unit* is empty or never occurs.
    """
    unit_len = len(unit)
    if unit_len == 0:
        # An empty unit has no meaningful repeat count (and would divide by zero).
        return 0

    # run_at[i] = number of consecutive copies of unit that start at index i.
    # Filled right-to-left so run_at[i + unit_len] is already known.
    run_at = [0] * (len(sequence) + 1)
    best = 0
    for start in range(len(sequence) - unit_len, -1, -1):
        if sequence.startswith(unit, start):
            run_at[start] = run_at[start + unit_len] + 1
            best = max(best, run_at[start])
    return best


def _find_match(database, str_names, counts):
    """Return the 'name' of the first row whose STR counts all equal *counts*, else 'No match'."""
    for person in database:
        if all(int(person[unit]) == counts[unit] for unit in str_names):
            return person['name']
    return 'No match'


def main():
    """Identify whose DNA a sequence file belongs to and print the name.

    Expects exactly two command-line arguments: a CSV database path and a
    text file containing the DNA sequence. The CSV's first column is
    skipped when collecting STR names; every remaining column header is
    treated as an STR, and each row holds the expected longest run of
    that STR. Prints the matching row's 'name' value, or 'No match'.
    Prints a usage message and exits with status 1 on a wrong argument count.
    """
    if len(argv) != 3:
        print("Usage: python dna.py data.csv sequence.txt")
        exit(1)

    # store dna string to be matched in string 's'
    with open(argv[2], 'r') as sequence_file:
        s = sequence_file.read()

    # read the dna sequence of people from the database; the context manager
    # guarantees the CSV handle is closed once all rows are loaded
    with open(argv[1], newline='') as csvfile:
        reader = csv.DictReader(csvfile, delimiter=',')
        # fieldnames is None for a completely empty file
        columns = reader.fieldnames or []
        database = list(reader)

    # every column after the first is an STR to look for in s
    str_names = columns[1:]

    # key:value = STR name : longest run of consecutive repeats of it in s
    counts = {unit: _longest_run(s, unit) for unit in str_names}

    # the result defaults to 'No match', so an empty database is handled too
    print(_find_match(database, str_names, counts))
# Run the program only when executed as a script, not when imported.
if __name__ == "__main__":
    main()