-2
# Text of every PDF in `files`: one list of text lines per file, in order.
files_data = []

for file in files:
    with pdfplumber.open(file) as pdf:
        lines = []
        for page in pdf.pages:
            # NOTE(review): extract_text() is reported to return None for
            # pages without any text (e.g. scanned images) in older
            # pdfplumber releases; fall back to '' so .split() cannot fail.
            text = page.extract_text() or ''
            lines.extend(text.split('\n'))
        files_data.append(lines)

# Re-join the lines of the first two files into one string per file.
z = '\n'.join(files_data[0])
z1 = '\n'.join(files_data[1])

# list1 holds the full text of each of those two documents.
list1 = [z, z1]

print(list1)


# Patterns for pulling individual fields out of the extracted invoice text.
# The field names are Dutch (BTW ~ VAT, KVK ~ chamber of commerce,
# Factuur ~ invoice, Vervaldatum ~ due date, Betreft ~ subject).

# Two letters, nine digits, one letter, two digits (e.g. "NL123456789B01").
BTW_re = re.compile(r'([A-Za-z]{2}\d{9}[A-Za-z]\d{2})')
# The literal label "KVK-nummer: " followed by one or more digits.
KVK_re = re.compile(r'(KVK-nummer: \d+)')
# Two capitals, two digits, any one character, four capitals, then the rest
# of the line (group 2 holds that remainder).
IBAN_re = re.compile(r'([A-Z]{2}\d{2}.[A-Z]{4}(.*))')
# Six capitals, one digit, one capital.
BIC_re = re.compile(r'([A-Z]{6}\d[A-Z])')
# The non-whitespace token after "Factuur:" (group 2 is the token itself).
Factuur_re = re.compile(r'((?<=Factuur:)\s([\S]+))')
# A date written as dd/dd/dddd; no capture group, use group(0).
Factuurdatum_re = re.compile(r'\d{2}/\d{2}/\d{4}')
# One whitespace character plus the rest of the line after "Vervaldatum:".
Vervaldatum_re = re.compile(r'((?<=Vervaldatum:)\s.+)')
# The label "Betreft: " and the rest of the line (group 2 is the text).
Betreft_re = re.compile(r'(Betreft: (.*))')
# A whole line; with re.M, ^ and $ anchor at every line, so search() yields
# the first line and findall()/finditer() yield every line.
partyname_re = re.compile(r'^.*$', re.M)
# The first two lines of the string; group 1 ends up holding the second line
# because a repeated group keeps only its last repetition.
Adressline_re = re.compile(r'^(?:(.*)\r?\n){2}')
# Four digits immediately followed by two capitals (e.g. "1234AB").
postalcode_re = re.compile(r'(\d{4}[A-Z]{2})')
# One arbitrary character (typically the separating space) and a capitalised
# word directly after a postal code. [A-Za-z] is used instead of [A-z],
# because the range A-z also covers the characters [ \ ] ^ _ and `.
City_re = re.compile(r'((?<=\d{4}[A-Z]{2})[\s\S][A-Z][A-Za-z]*)')

# Show the first line of each document's text (the first match of
# partyname_re in that text).
for document_text in list1:
    first_line_match = partyname_re.search(document_text)
    print(first_line_match.group(0))

For instance, for the print statement above, the two values should be stored in separate lists:

"Rompslomp.nl B.V." & "Rompslomp.nl B.V." (see picture attached)

I don't know how to hack this.. Can someone help me?

Jupyter Notebook

Max FH
  • 59
  • 6
  • So just to be clear, if there are 2 instances of the word Rompslomp.nl B.V in your text, you need it in a format like this -> [ ["Rompslomp.nl B.V"], ["Rompslomp.nl B.V"] ]. Did I understand it correctly? – BennyHawk Nov 26 '20 at 16:53
  • Yes Benny Hawk! – Max FH Nov 26 '20 at 17:18

1 Answer

-1

From what I have understood, I believe this code should help you

# Template: compile the pattern once. With MULTILINE, ^ and $ anchor at the
# start and end of every line rather than only of the whole string.
regex = re.compile(r"<some regex here>", re.MULTILINE)
test_str = "<some text here>"

matches = regex.finditer(test_str)

# Report every match and then each of its capture groups (numbered from 1).
for match_index, match in enumerate(matches, start=1):
    print("Match {matchNum} was found at {start}-{end}: {match}".format(
        matchNum=match_index,
        start=match.start(),
        end=match.end(),
        match=match.group(),
    ))

    group_count = len(match.groups())
    for group_index in range(1, group_count + 1):
        print("Group {groupNum} found at {start}-{end}: {group}".format(
            groupNum=group_index,
            start=match.start(group_index),
            end=match.end(group_index),
            group=match.group(group_index),
        ))

Here is an amazing website that can also generate Python code for you. The above code was taken from that website.

Regex101

BennyHawk
  • 172
  • 2
  • 11
  • Thanks for your comment BennyHawk, however I can't see your code in RegEx101, which makes it hard to understand – Max FH Nov 26 '20 at 17:45