32

I'm looking for a simple way of parsing complex text files into a pandas DataFrame. Below is a sample file, what I want the result to look like after parsing, and my current method.

Is there any way to make it more concise/faster/more pythonic/more readable?

I've also put this question on Code Review.

I eventually wrote a blog article to explain this to beginners.

Here is a sample file:

Sample text

A selection of students from Riverdale High and Hogwarts took part in a quiz. This is a record of their scores.

School = Riverdale High
Grade = 1
Student number, Name
0, Phoebe
1, Rachel

Student number, Score
0, 3
1, 7

Grade = 2
Student number, Name
0, Angela
1, Tristan
2, Aurora

Student number, Score
0, 6
1, 3
2, 9

School = Hogwarts
Grade = 1
Student number, Name
0, Ginny
1, Luna

Student number, Score
0, 8
1, 7

Grade = 2
Student number, Name
0, Harry
1, Hermione

Student number, Score
0, 5
1, 10

Grade = 3
Student number, Name
0, Fred
1, George

Student number, Score
0, 0
1, 0

Here is what I want the result to look like after parsing:

                                         Name  Score
School         Grade Student number                 
Hogwarts       1     0                  Ginny      8
                     1                   Luna      7
               2     0                  Harry      5
                     1               Hermione     10
               3     0                   Fred      0
                     1                 George      0
Riverdale High 1     0                 Phoebe      3
                     1                 Rachel      7
               2     0                 Angela      6
                     1                Tristan      3
                     2                 Aurora      9

Here is how I currently parse it:

import re
import pandas as pd


def parse(filepath):
    """
    Parse text at given filepath

    Parameters
    ----------
    filepath : str
        Filepath for file to be parsed

    Returns
    -------
    data : pd.DataFrame
        Parsed data

    """

    data = []
    with open(filepath, 'r') as file:
        line = file.readline()
        while line:
            reg_match = _RegExLib(line)

            if reg_match.school:
                school = reg_match.school.group(1)

            if reg_match.grade:
                grade = reg_match.grade.group(1)
                grade = int(grade)

            if reg_match.name_score:
                value_type = reg_match.name_score.group(1)
                line = file.readline()
                while line.strip():
                    number, value = line.strip().split(',')
                    value = value.strip()
                    dict_of_data = {
                        'School': school,
                        'Grade': grade,
                        'Student number': number,
                        value_type: value
                    }
                    data.append(dict_of_data)
                    line = file.readline()

            line = file.readline()

        data = pd.DataFrame(data)
        data.set_index(['School', 'Grade', 'Student number'], inplace=True)
        # consolidate df to remove nans
        data = data.groupby(level=data.index.names).first()
        # upgrade Score from float to integer
        data = data.apply(pd.to_numeric, errors='ignore')
    return data


class _RegExLib:
    """Set up regular expressions"""
    # use https://regexper.com to visualise these if required
    _reg_school = re.compile('School = (.*)\n')
    _reg_grade = re.compile('Grade = (.*)\n')
    _reg_name_score = re.compile('(Name|Score)')

    def __init__(self, line):
        # check whether line has a positive match with all of the regular expressions
        self.school = self._reg_school.match(line)
        self.grade = self._reg_grade.match(line)
        self.name_score = self._reg_name_score.search(line)


if __name__ == '__main__':
    filepath = 'sample.txt'
    data = parse(filepath)
    print(data)
bluprince13
  • 4,607
  • 12
  • 44
  • 91
  • 3
    When parsing text, consider these approaches in order of increasing learning curves: `str` methods, `re`/`regex` modules, parsing library (e.g. parsimonious, PLY, pyparsing etc). – pylang Jan 10 '18 at 23:02

5 Answers5

39

Update 2019 (PEG parser):

This answer has received quite some attention so I felt to add another possibility, namely a parsing option. Here we could use a PEG parser instead (e.g. parsimonious) in combination with a NodeVisitor class:

from parsimonious.grammar import Grammar
from parsimonious.nodes import NodeVisitor
import pandas as pd
grammar = Grammar(
    r"""
    schools         = (school_block / ws)+

    school_block    = school_header ws grade_block+ 
    grade_block     = grade_header ws name_header ws (number_name)+ ws score_header ws (number_score)+ ws? 

    school_header   = ~"^School = (.*)"m
    grade_header    = ~"^Grade = (\d+)"m
    name_header     = "Student number, Name"
    score_header    = "Student number, Score"

    number_name     = index comma name ws
    number_score    = index comma score ws

    comma           = ws? "," ws?

    index           = number+
    score           = number+

    number          = ~"\d+"
    name            = ~"[A-Z]\w+"
    ws              = ~"\s*"
    """
)

tree = grammar.parse(data)

class SchoolVisitor(NodeVisitor):
    output, names = ([], [])
    current_school, current_grade = None, None

    def _getName(self, idx):
        for index, name in self.names:
            if index == idx:
                return name

    def generic_visit(self, node, visited_children):
        return node.text or visited_children

    def visit_school_header(self, node, children):
        self.current_school = node.match.group(1)

    def visit_grade_header(self, node, children):
        self.current_grade = node.match.group(1)
        self.names = []

    def visit_number_name(self, node, children):
        index, name = None, None
        for child in node.children:
            if child.expr.name == 'name':
                name = child.text
            elif child.expr.name == 'index':
                index = child.text

        self.names.append((index, name))

    def visit_number_score(self, node, children):
        index, score = None, None
        for child in node.children:
            if child.expr.name == 'index':
                index = child.text
            elif child.expr.name == 'score':
                score = child.text

        name = self._getName(index)

        # build the entire entry
        entry = (self.current_school, self.current_grade, index, name, score)
        self.output.append(entry)

sv = SchoolVisitor()
sv.visit(tree)

df = pd.DataFrame.from_records(sv.output, columns = ['School', 'Grade', 'Student number', 'Name', 'Score'])
print(df)

Regex option (original answer)

Well then, watching Lord of the Rings the xth time, I had to bridge some time to the very finale:


Broken down, the idea is to split the problem up into several smaller problems:
  1. Separate each school
  2. ... each grade
  3. ... student and scores
  4. ... bind them together in a dataframe afterwards


The school part (see a demo on regex101.com)
^
School\s*=\s*(?P<school_name>.+)
(?P<school_content>[\s\S]+?)
(?=^School|\Z)


The grade part (another demo on regex101.com)
^
Grade\s*=\s*(?P<grade>.+)
(?P<students>[\s\S]+?)
(?=^Grade|\Z)


The student/score part (last demo on regex101.com):
^
Student\ number,\ Name[\n\r]
(?P<student_names>(?:^\d+.+[\n\r])+)
\s*
^
Student\ number,\ Score[\n\r]
(?P<student_scores>(?:^\d+.+[\n\r])+)

The rest is a generator expression which is then fed into the DataFrame constructor (along with the column names).


The code:
import pandas as pd, re

rx_school = re.compile(r'''
    ^
    School\s*=\s*(?P<school_name>.+)
    (?P<school_content>[\s\S]+?)
    (?=^School|\Z)
''', re.MULTILINE | re.VERBOSE)

rx_grade = re.compile(r'''
    ^
    Grade\s*=\s*(?P<grade>.+)
    (?P<students>[\s\S]+?)
    (?=^Grade|\Z)
''', re.MULTILINE | re.VERBOSE)

rx_student_score = re.compile(r'''
    ^
    Student\ number,\ Name[\n\r]
    (?P<student_names>(?:^\d+.+[\n\r])+)
    \s*
    ^
    Student\ number,\ Score[\n\r]
    (?P<student_scores>(?:^\d+.+[\n\r])+)
''', re.MULTILINE | re.VERBOSE)


result = ((school.group('school_name'), grade.group('grade'), student_number, name, score)
    for school in rx_school.finditer(string)
    for grade in rx_grade.finditer(school.group('school_content'))
    for student_score in rx_student_score.finditer(grade.group('students'))
    for student in zip(student_score.group('student_names')[:-1].split("\n"), student_score.group('student_scores')[:-1].split("\n"))
    for student_number in [student[0].split(", ")[0]]
    for name in [student[0].split(", ")[1]]
    for score in [student[1].split(", ")[1]]
)

df = pd.DataFrame(result, columns = ['School', 'Grade', 'Student number', 'Name', 'Score'])
print(df)


Condensed:
rx_school = re.compile(r'^School\s*=\s*(?P<school_name>.+)(?P<school_content>[\s\S]+?)(?=^School|\Z)', re.MULTILINE)
rx_grade = re.compile(r'^Grade\s*=\s*(?P<grade>.+)(?P<students>[\s\S]+?)(?=^Grade|\Z)', re.MULTILINE)
rx_student_score = re.compile(r'^Student number, Name[\n\r](?P<student_names>(?:^\d+.+[\n\r])+)\s*^Student number, Score[\n\r](?P<student_scores>(?:^\d+.+[\n\r])+)', re.MULTILINE)


This yields
            School Grade Student number      Name Score
0   Riverdale High     1              0    Phoebe     3
1   Riverdale High     1              1    Rachel     7
2   Riverdale High     2              0    Angela     6
3   Riverdale High     2              1   Tristan     3
4   Riverdale High     2              2    Aurora     9
5         Hogwarts     1              0     Ginny     8
6         Hogwarts     1              1      Luna     7
7         Hogwarts     2              0     Harry     5
8         Hogwarts     2              1  Hermione    10
9         Hogwarts     3              0      Fred     0
10        Hogwarts     3              1    George     0


As for timing, this is the result running it a ten thousand times:
import timeit
print(timeit.timeit(makedf, number=10**4))
# 11.918397722000009 s

Jan
  • 42,290
  • 8
  • 54
  • 79
  • 3
    oh wow! This is amazing. I'll hopefully be able to spit out code like this myself one day. However, the reason I asked my question was so I could come up with an easy to understand way of parsing text files that I could teach to a complete beginner. I think your code is insanely concise, but perhaps not something a beginner would be able to put together themselves easily. Thanks for sharing though! I'll be studying this to further my understanding. :) – bluprince13 Dec 26 '17 at 22:19
  • 1
    @bluprince13: No, that is definitely not sth. to teach a beginner :) – Jan Dec 26 '17 at 22:24
  • 2
    @bluprince13: You could really consider https://codereview.stackexchange.com/ then. – Jan Dec 26 '17 at 22:26
  • 2
    @bluprince13: compared to yours, this is the simpler, easier expandable, and more maintainable code. It is not complicated just "for the sake of it". Though I agree that it's not something to start with But I imagine Jan started out with simpler models first, where yours is already (necessarily) complicated. As an example of what is *possible*, this stands out, But I am sure the fellows at CodeReview can help you further. – Jongware Dec 26 '17 at 23:09
  • 2
    Good answer. :) – Wiktor Stribiżew May 11 '19 at 17:57
8

here is my suggestion using split and pd.concat ("txt" stands for a copy of the original text in the question), basicly the idea is to split by the group words and then concat into data frames, the most inner parsing takes advantage of the fact that the names and grades are in a csv like format. here goes:

import pandas as pd
from io import StringIO

schools = txt.lower().split('school = ')
schools_dfs = []
for school in schools[1:]:
    grades = school.split('grade = ') 
    grades_dfs = []
    for grade in grades[1:]:
        features = grade.split('student number,')
        feature_dfs = []
        for feature in features[1:]:
            feature_dfs.append(pd.read_csv(StringIO(feature)))
        feature_df = pd.concat(feature_dfs, axis=1)
        feature_df['grade'] = features[0].replace('\n','')
        grades_dfs.append(feature_df)
    grades_df = pd.concat(grades_dfs)
    grades_df['school'] = grades[0].replace('\n','')
    schools_dfs.append(grades_df)
schools_df = pd.concat(schools_dfs)

schools_df.set_index(['school', 'grade'])

enter image description here

Ezer K
  • 3,637
  • 3
  • 18
  • 34
7

I would suggest using a parser combinator library like parsy. Compared to using regexes, the result will not be as concise, but it will be much more readable and robust, while still being relatively light-weight.

Parsing is in general quite a hard task, and an approach that is good for people at beginner level for general programming might be hard to find.

EDIT 2022: Full example code, using modern Parsy, that parses your supplied example and produces the same output.

It separates into 3 phases:

  • parsing into some basic data structures
  • transforming slightly to match up student names/numbers/scores
  • converting to DataFrame

This separation means fewer hacks at the DataFrame level are needed.

import pandas as pd
from parsy import string, regex, seq
from dataclasses import dataclass


@dataclass
class Student:
    name: str
    number: int


@dataclass
class Score:
    score: int
    number: int


@dataclass
class StudentWithScore:
    name: str
    number: int
    score: int


@dataclass
class Grade:
    grade: int
    students: list[Student]
    scores: list[Score]

    @property
    def students_with_scores(self) -> list[StudentWithScore]:
        names = {st.number: st.name for st in self.students}
        return [StudentWithScore(names[score.number], score.number, score.score) for score in self.scores]


@dataclass
class School:
    name: str
    grades: list[Grade]


integer = regex(r"\d+").map(int)
student_number = integer
score = integer
student_name = regex(r"[^\n]+")
student_def = seq(
    number=student_number << string(", "),
    name=student_name << string("\n"),
).combine_dict(Student)

student_def_list = string("Student number, Name\n") >> student_def.many()
score_def = seq(
    number=student_number << string(", "),
    score=score << string("\n"),
).combine_dict(Score)
score_def_list = string("Student number, Score\n") >> score_def.many()
grade_value = integer
grade_def = string("Grade = ") >> grade_value << string("\n")
school_grade = seq(
    grade=grade_def,
    students=student_def_list << regex(r"\n*"),
    scores=score_def_list << regex(r"\n*"),
).combine_dict(Grade)

school_name = regex(r"[^\n]+")
school_def = string("School = ") >> school_name << string("\n")
school = seq(
    name=school_def,
    grades=school_grade.many(),
).combine_dict(School)


def parse(text: str) -> list[School]:
    return school.many().parse(text)


def schools_to_dataframe(schools: list[School]) -> pd.DataFrame:
    data_dicts = [
        {"School": school.name, "Grade": g.grade, "Student number": s.number, "Name": s.name, "Score": s.score}
        for school in schools
        for g in school.grades
        for s in g.students_with_scores
    ]
    data = pd.DataFrame(data_dicts)
    data.set_index(["School", "Grade", "Student number"], inplace=True)
    return data


if __name__ == "__main__":
    filepath = "sample.txt"
    text = open(filepath).read()
    start = text.index("School =")
    schools = parse(text[start:])
    data = schools_to_dataframe(schools)
    print(data)
spookylukey
  • 6,380
  • 1
  • 31
  • 34
3

In a similar manner to your original code I define the parsing regex's

import re
import pandas as pd

parse_re = {
    'school': re.compile(r'School = (?P<school>.*)$'),
    'grade': re.compile(r'Grade = (?P<grade>\d+)'),
    'student': re.compile(r'Student number, (?P<info>\w+)'),
    'data': re.compile(r'(?P<number>\d+), (?P<value>.*)$'),
}

def parse(line):
    '''parse the line by regex search against possible line formats
       returning the id and match result of first matching regex,
       or None if no match is found'''
    return reduce(lambda (i,m),(id,rx): (i,m) if m else (id, rx.search(line)), 
                  parse_re.items(), (None,None))

then loop through the lines gathering the information about each student. Once the record is complete (when we have Score the record is complete) we append the record to a list.

A small state machine that is driven by the line by line regex matches collates each record. In particular we have to save the students in a grade by number as their Score and Name are provided separately in the input file.

results = []
with open('sample.txt') as f:
    record = {}
    for line in f:
        id, match = parse(line)

        if match is None:
            continue

        if id == 'school':
            record['School'] = match.group('school')
        elif id == 'grade':
            record['Grade'] = int(match.group('grade'))
            names = {}  # names is a number indexed dictionary of student names
        elif id == 'student':
            info = match.group('info')
        elif id == 'data':
            number = int(match.group('number'))
            value = match.group('value')
            if info == 'Name':
                names[number] = value
            elif info == 'Score':
                record['Student number'] = number
                record['Name'] = names[number]
                record['Score'] = int(value)
                results.append(record.copy())

Finally the list of records is converted to a DataFrame.

df = pd.DataFrame(results, columns=['School', 'Grade', 'Student number', 'Name', 'Score'])
print df

Outputs:

            School  Grade  Student number      Name  Score
0   Riverdale High      1               0    Phoebe      3
1   Riverdale High      1               1    Rachel      7
2   Riverdale High      2               0    Angela      6
3   Riverdale High      2               1   Tristan      3
4   Riverdale High      2               2    Aurora      9
5         Hogwarts      1               0     Ginny      8
6         Hogwarts      1               1      Luna      7
7         Hogwarts      2               0     Harry      5
8         Hogwarts      2               1  Hermione     10
9         Hogwarts      3               0      Fred      0
10        Hogwarts      3               1    George      0

Some optimizations would be to compare the most common regex's first and to explicitly skip blank lines. Building the dataframe as we go would avoid extra copies of the data but I gather that appending to a dataframe is an expensive operation.

Mike Robins
  • 1,733
  • 10
  • 14
  • This is really good, thanks. I like that you used a function instead of class to do the regex matching. – bluprince13 Jan 07 '18 at 17:31
  • @blueprince13 In practice these functions would probably end up as methods in a class. Anything that has state belongs in an object so that one can have multiple instances at the same time. – Mike Robins Jan 08 '18 at 08:36
0

This is exactly the type of problem that Pawpaw was designed for.

Pawpaw is a high performance parsing & text segmentation framework that allows you to quickly and easily build complex, pipelined parsers. Segments are automatically organized into tree graphs that can be serialized, traversed, and searched using a powerful structured query language

Here two different Pawpaw-based approaches that solve this problem. I've copied the most compact version below, along with some added tree visualization thrown in:

Code

import sys
import os.path

import regex
from pawpaw import arborform, Ito, visualization
import pandas as pd

def get_parser() -> arborform.Itorator:
    return arborform.Extract(
        regex.compile(
            r'(?<school>School = (?<name>.+?)\n'
            r'(?<grade>Grade = (?<key>\d+)\n'
            r'Student number, Name\n(?P<stu_num_names>(?:(?P<stu_num>\d+), (?P<name>.+?)\n)+)\n'
            r'Student number, Score\n(?P<stu_num_scores>(?:(?P<stu_num>\d+), (?P<score>\d+)(?:$|\n))+)(?:$|\n)'
            r')+)+',
            regex.DOTALL
    )
)

# read file
with open(os.path.join(sys.path[0], 'input.txt')) as f:
    ito = Ito(f.read(), desc='all')

# parse
parser = get_parser()
ito.children.add(*parser(ito))

# display Pawpaw tree
tree_vis = visualization.pepo.Tree()
print(tree_vis.dumps(ito))

# build pandas DataFrame
d = []
for school in ito.find_all('*[d:school]'):
    school_name = str(school.find('*[d:name]'))
    for grade in school.find_all('**[d:grade]'):
        grade_key = int(str(grade.find('*[d:key]')))
        for stu_num in grade.find_all('*[d:stu_num_names]/*[d:stu_num]'):
            stu_name = str(stu_num.find('>[d:name]'))
            stu_num = str(stu_num)
            stu_score = int(str(grade.find('*[d:stu_num_scores]/*[d:stu_num]&[s:' + stu_num + ']/>[d:score]')))
            d.append({'School': school_name, 'Grade': grade_key, 'Student number': stu_num, 'Name': stu_name, 'Score': stu_score})
data = pd.DataFrame(d)
data.set_index(['School', 'Grade', 'Student number'], inplace=True)
data = data.groupby(level=data.index.names).first()

# display pandas DataFrame
print(data)

Output

(0, 478) 'all' : 'School = Riverdale H…, Score\n0, 0\n1, 0'
├──(0, 210) 'school' : 'School = Riverdale H…, 6\n1, 3\n2, 9\n\n'
│  ├──(9, 23) 'name' : 'Riverdale High'
│  ├──(24, 109) 'grade' : 'Grade = 1\nStudent n…ore\n0, 3\n1, 7\n\n'
│  │  ├──(32, 33) 'key' : '1'
│  │  ├──(55, 75) 'stu_num_names' : '0, Phoebe\n1, Rachel\n'
│  │  │  ├──(55, 56) 'stu_num' : '0'
│  │  │  ├──(58, 64) 'name' : 'Phoebe'
│  │  │  ├──(65, 66) 'stu_num' : '1'
│  │  │  └──(68, 74) 'name' : 'Rachel'
│  │  └──(98, 108) 'stu_num_scores' : '0, 3\n1, 7\n'
│  │     ├──(98, 99) 'stu_num' : '0'
│  │     ├──(101, 102) 'score' : '3'
│  │     ├──(103, 104) 'stu_num' : '1'
│  │     └──(106, 107) 'score' : '7'
│  └──(109, 210) 'grade' : 'Grade = 2\nStudent n…, 6\n1, 3\n2, 9\n\n'
│     ├──(117, 118) 'key' : '2'
│     ├──(140, 171) 'stu_num_names' : '0, Angela\n1, Tristan\n2, Aurora\n'
│     │  ├──(140, 141) 'stu_num' : '0'
│     │  ├──(143, 149) 'name' : 'Angela'
│     │  ├──(150, 151) 'stu_num' : '1'
│     │  ├──(153, 160) 'name' : 'Tristan'
│     │  ├──(161, 162) 'stu_num' : '2'
│     │  └──(164, 170) 'name' : 'Aurora'
│     └──(194, 209) 'stu_num_scores' : '0, 6\n1, 3\n2, 9\n'
│        ├──(194, 195) 'stu_num' : '0'
│        ├──(197, 198) 'score' : '6'
│        ├──(199, 200) 'stu_num' : '1'
│        ├──(202, 203) 'score' : '3'
│        ├──(204, 205) 'stu_num' : '2'
│        └──(207, 208) 'score' : '9'
└──(210, 478) 'school' : 'School = Hogwarts\nG…, Score\n0, 0\n1, 0'
   ├──(219, 227) 'name' : 'Hogwarts'
   ├──(228, 310) 'grade' : 'Grade = 1\nStudent n…ore\n0, 8\n1, 7\n\n'
   │  ├──(236, 237) 'key' : '1'
   │  ├──(259, 276) 'stu_num_names' : '0, Ginny\n1, Luna\n'
   │  │  ├──(259, 260) 'stu_num' : '0'
   │  │  ├──(262, 267) 'name' : 'Ginny'
   │  │  ├──(268, 269) 'stu_num' : '1'
   │  │  └──(271, 275) 'name' : 'Luna'
   │  └──(299, 309) 'stu_num_scores' : '0, 8\n1, 7\n'
   │     ├──(299, 300) 'stu_num' : '0'
   │     ├──(302, 303) 'score' : '8'
   │     ├──(304, 305) 'stu_num' : '1'
   │     └──(307, 308) 'score' : '7'
   ├──(310, 397) 'grade' : 'Grade = 2\nStudent n…re\n0, 5\n1, 10\n\n'
   │  ├──(318, 319) 'key' : '2'
   │  ├──(341, 362) 'stu_num_names' : '0, Harry\n1, Hermione\n'
   │  │  ├──(341, 342) 'stu_num' : '0'
   │  │  ├──(344, 349) 'name' : 'Harry'
   │  │  ├──(350, 351) 'stu_num' : '1'
   │  │  └──(353, 361) 'name' : 'Hermione'
   │  └──(385, 396) 'stu_num_scores' : '0, 5\n1, 10\n'
   │     ├──(385, 386) 'stu_num' : '0'
   │     ├──(388, 389) 'score' : '5'
   │     ├──(390, 391) 'stu_num' : '1'
   │     └──(393, 395) 'score' : '10'
   └──(397, 478) 'grade' : 'Grade = 3\nStudent n…, Score\n0, 0\n1, 0'
      ├──(405, 406) 'key' : '3'
      ├──(428, 446) 'stu_num_names' : '0, Fred\n1, George\n'
      │  ├──(428, 429) 'stu_num' : '0'
      │  ├──(431, 435) 'name' : 'Fred'
      │  ├──(436, 437) 'stu_num' : '1'
      │  └──(439, 445) 'name' : 'George'
      └──(469, 478) 'stu_num_scores' : '0, 0\n1, 0'
         ├──(469, 470) 'stu_num' : '0'
         ├──(472, 473) 'score' : '0'
         ├──(474, 475) 'stu_num' : '1'
         └──(477, 478) 'score' : '0'

                                         Name  Score
School         Grade Student number                 
Hogwarts       1     0                  Ginny      8
                     1                   Luna      7
               2     0                  Harry      5
                     1               Hermione     10
               3     0                   Fred      0
                     1                 George      0
Riverdale High 1     0                 Phoebe      3
                     1                 Rachel      7
               2     0                 Angela      6
                     1                Tristan      3
                     2                 Aurora      9
rlayers
  • 21
  • 3