
I want to convert text to CSV. The input file contains 10000K lines. A sample of the input file is below:

Item=a
Price=10
colour=pink


Item=b
Price=20
colour=blue Pattern=checks

My output should look like this:

Item Price Colour Pattern

a    10    pink
b    20    blue   checks

My code is below:

import csv
import glob
import os

def dat_to_csv(filename, table_name):
    with open(filename, 'r',errors='ignore') as reader:
        list_of_columns = []
        table_values = []
        master_table = []
        counter = 0
        column_name1 = []
        value1 = []
        column_name2 = []
        value2 = []
        column_name3 = []
        value3 = []
        column_name4 = []
        value4 = []
        lines_after_23 = reader.readlines()[23:]
        for line in lines_after_23:
        #stripped_line = line.strip()
            if line.startswith("#"):
                continue
            if line.startswith("Associate"):
                continue
            if line == "\n":
                if (table_values):
                    master_table.append([])
                    master_table[counter] = table_values.copy()
                    counter = counter + 1
                    length = len(table_values)
                    for element in range(length):
                        table_values[element] = []
                continue 
            if line == "\n":
                continue

            if line == "\n":
                master_table.append([])
                master_table[counter] = table_values.copy()
                counter = counter + 1
                length = len(table_values)
                for element in range(length):
                    table_values[element] = []
                break

            extra_stripped_line = ' '.join(line.split())
             

            data = extra_stripped_line.split("=",1)
            column_name = data[0].strip()
            if '=' in data[1].strip():
                data1 = data[1].split(" ",1)
                value = data1[0].strip()
                data2 = data1[1].split("=",1)
                column_name1 = data2[0].strip()
                
                if '=' in data2[1].strip():
                    column_name2 = []
                    value2 = []
                    data3 = data2[1].split(" ",1)
                    value1 = data3[0].strip()
                    data4 = data3[1].split("=",1)
                    column_name2 = data4[0].strip()
                    
                    if '=' in data4[1].strip():
                        data5 = data4[1].split(" ",1)
                        value2 = data5[0].strip()
                        data6 = data5[1].split("=",1)
                        column_name3 = data6[0].strip()
                        
                        if '=' in data6[1].strip():
                            data7 = data6[1].split(" ",1)
                            value3 = data7[0].strip()
                            data8 = data7[1].split("=",1)
                            column_name4 = data8[0].strip()
                            
                            if '=' in data8[1].strip():
                                data9 = data8[1].split(" ",1)
                                value3 = data9[0].strip()
                                data10 = data9[1].split("=",1)
                                column_name4 = data10[0].strip()
                                value4 = data10[1].strip()
                            else:
                                value4 = data8[1].strip()
                        else:
                            value3 = data6[1].strip()
                        
                    else:
                        value2 = data4[1].strip()
                    
                else:
                    value1 = data2[1].strip()
            else:
                value = data[1].strip()  
                                
            if column_name not in list_of_columns:
                list_of_columns.append(column_name)
                table_values.append([])
                
            if column_name1 is not []:
                if column_name1 not in list_of_columns:
                    list_of_columns.append(column_name1)
                    table_values.append([])
                
            if column_name2 is not []:
                if column_name2 not in list_of_columns:
                    list_of_columns.append(column_name2)
                    table_values.append([]) 
                
            if column_name3 is not []:
                if column_name3 not in list_of_columns:
                    list_of_columns.append(column_name3)
                    table_values.append([]) 
                
            if column_name4 is not []:
                if column_name4 not in list_of_columns:
                    list_of_columns.append(column_name4)
                    table_values.append([])       

            index = list_of_columns.index(column_name)
            if column_name1 is not []:
                index1 = list_of_columns.index(column_name1)
            if column_name2 is not []:
                index2 = list_of_columns.index(column_name2)
            if column_name3 is not []:
                index3 = list_of_columns.index(column_name3)
            if column_name4 is not []:
                index4 = list_of_columns.index(column_name4)

            #table_values[index].append(value)
            table_values[index] = value
            if value1 is not []:
                table_values[index1] = value1
            if value2 is not []:
                table_values[index2] = value2
            if value3 is not []:
                table_values[index3] = value3
            if value4 is not []:
                table_values[index4] = value4
             
        #with open("output\\{}.csv".format(table_name), 'w', newline='') as csvfile:
        with open("yourpath\\{}.csv".format(table_name), 'w', newline='') as csvfile:
            writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
            writer.writerow(list_of_columns)

            #t_table_values = zip(*table_values)
            max_elements = len(master_table)
            master_table_transp = []
            cntr = 0
            for cntr in range(max_elements):
                master_table_transp.append([])
            num_objects = len(master_table)

            for cntr_obj in range(num_objects):
                for cntr_row in range(max_elements):
                    if (cntr_row<len(master_table[cntr_obj])):
                        master_table_transp[cntr_row].append(master_table[cntr_obj][cntr_row])
                    else:
                        master_table_transp[cntr_row].append([])
                        
                   
            
            t_table_values = zip(*master_table_transp)
                       
            for values in t_table_values:
                writer.writerow(values)
                
            if value1 is not []:
                for value1s in t_table_values:
                    writer.writerow(value1s)
                
            if value2 is not []:
                    for value2s in t_table_values:
                        writer.writerow(value2s)
                
            if value3 is not []:
                for value3s in t_table_values:
                    writer.writerow(value3s)
                
            if value4 is not []:
                for value4s in t_table_values:
                    writer.writerow(value4s) 
                
                    
if __name__ == '__main__':
    path = "your path" 
    for filename in glob.glob((os.path.join(path, '*.dat'))):
        name_only = os.path.basename(filename).replace(".dat", "")
        dat_to_csv(filename, name_only)

I am getting the required output, but there are a few issues:

  1. A blank column is being created at the 2nd position.
  2. The last item ("item=n") is not showing in my CSV; all the other items are showing.
  • make a [mcve] so we can copy and run the code. – Mark Tolonen Feb 25 '22 at 18:48
  • "if there are more than 1 like 2/3 '=' then I am not sure" please [edit] your sample of the input file to show some lines where your current code _doesn't_ work, with the expected output of those lines, so we can better understand. – G. Anderson Feb 25 '22 at 18:49
  • I have uploaded my whole code; you can run it directly by just editing the folder path in 2 locations. For the text, you can copy whatever I have mentioned as the input file. –  Feb 25 '22 at 19:02
  • _Minimal_ reproducible example, please. Now's a good time to learn [how to debug small programs](//ericlippert.com/2014/03/05/how-to-debug-small-programs/) and [to use a debugger](/q/25385173). Step through your code and observe what each line of code does. Identify where your program differs from your expectations by comparing these intermediate results with expected results. Work backwards from there to narrow down the cause of the problem. This will help you to remove as much irrelevant code as possible from your MRE, and often you'll even figure out how to fix it without asking on SO. – Pranav Hosangadi Feb 25 '22 at 19:08
  • Thank you all for your answers; they were great, but I figured out there was a typo on my side, so the code was not working as expected. I have now updated my code here, but I am still getting 3 issues. Can someone let me know what the issue is? I am very new to Python. –  Mar 02 '22 at 14:06
  • I just read part of your code for the first time, and your sample data doesn’t represent the problem… no 23 lines to skip, no ‘Associate’ line to skip. I’ve added another answer which more closely matches what you need. You’ll need to read through the first 23 lines before you start parsing the data. – Zach Young Mar 04 '22 at 16:30
  • You shouldn’t need as much code as you’ve written to solve this problem. I see you’re new and you’re trying. Please run the code we’ve offered and let us know if and how it does not work. Good luck! – Zach Young Mar 04 '22 at 16:36

3 Answers


Unless I've misunderstood something, it's as simple as this:

from pandas import DataFrame
from numpy import nan

master = [dict()]

with open('foo.txt') as foo:
    for line in foo:
        if (line := line.strip()):
            for token in line.split():
                k, v = token.split('=')
                master[-1][k] = v
        elif master[-1]:
            master.append(dict())

if not master[-1]:
    del master[-1]

if master:
    df = DataFrame(master).replace(nan, '', regex=True)
    df.to_csv('foo.csv', index=False)

Output (of the csv file):

Item,Price,colour,Pattern
a,10,pink,
b,20,blue,checks
– DarkKnight
  • Pretty clean solution. The whole if/if-not `master[-1]` took a bit of puzzling to figure out (both syntactically and semantically), maybe add a comment like, "only add new dict if last dict was filled" and "remove potentially empty dict from end of list"? – Zach Young Feb 25 '22 at 21:12

With some assumptions this works. I added some test cases. This does require all the records to fit in memory, but if you know all the possible column names in advance, you can set `columns` accordingly and write each line as it is generated instead of writing them all at the end (see the sketch after the output below). Even with 10000K (10M) records, unless the records are really large, that will easily fit in a modern system's memory.

input.txt

Item=a
Price=10

Item=b
Price=20
colour=blue Pattern=checks

Item=c
Price=5

Item=d Price=25 colour=blue

Item=e colour===FANCY== Price=1/2=$1

test.py

from collections import defaultdict
import csv

columns = {}
lines = []
needs_flush = False  # nothing parsed yet; avoids a NameError if the input file is empty

with open('input.txt') as fin:
    for line in fin:
        if not line.strip(): # write record on blank line
            needs_flush = False
            lines.append(columns)
            # blank all the columns to start next record.
            columns = {k:'' for k in columns}
            continue

        # assume multiple items on a line are separated by a single space
        items = line.strip().split(' ')
        
        # assume column name is before first = sign in each item
        for column,value in [item.split('=',1) for item in items]:
            needs_flush = True
            columns[column] = value

    # write record on EOF if hasn't been flushed
    if needs_flush:
        lines.append(columns)

# dump records to CSV
with open('output.csv','w',newline='') as fout:
    writer = csv.DictWriter(fout, fieldnames=columns)
    writer.writeheader()
    writer.writerows(lines)

output.csv:

Item,Price,colour,Pattern
a,10,,
b,20,blue,checks
c,5,,
d,25,blue,
e,1/2=$1,==FANCY==,
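
If the column names are known in advance, a minimal sketch of that streaming variant might look like the following (the fixed FIELDNAMES list and the output_stream.csv name are assumptions for illustration, not part of the code above). Each record is written as soon as its terminating blank line is seen, so only one record is held in memory at a time:

import csv

# Assumed, known-in-advance column set; adjust to match your real data.
FIELDNAMES = ['Item', 'Price', 'colour', 'Pattern']

with open('input.txt') as fin, open('output_stream.csv', 'w', newline='') as fout:
    writer = csv.DictWriter(fout, fieldnames=FIELDNAMES, restval='')
    writer.writeheader()

    record = {}
    for line in fin:
        line = line.strip()
        if not line:  # a blank line ends the current record
            if record:
                writer.writerow(record)
                record = {}
            continue
        # one or more key=value items per line
        for item in line.split():
            key, value = item.split('=', 1)
            record[key] = value

    if record:  # flush the last record if the file doesn't end with a blank line
        writer.writerow(record)

The trade-off is that any key not listed in FIELDNAMES makes DictWriter raise a ValueError (or gets dropped if you pass extrasaction='ignore'), which is why collecting all the records first is the safer default when the column set isn't known up front.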
– Mark Tolonen

I like to make little state machines for these kinds of problems, since as much as we'd all like to believe the sample data matches the real world, there are probably some gotchas, and you want a solution that's flexible.

For me, that flexibility means as I'm looping over the input lines:

  • avoid many nested structures
  • do all processing in one place; at the end of the loop, once I know this line is worth processing
  • for processing, try to break down the steps and still keep a flat/linear flow...
    • how do I go from a line to fields?
    • how do I go from a field to a key/value pair?
    • how do I use the key-value pair?

And as you try it out, start with smaller samples of your input and build up to the whole, big thing.

#!/usr/bin/env python3
import csv

field_names = {}  # use dict as ordered set to collect all field names as data is parsed
records = []
record = None
with open('input.txt') as f:
    for line in f:
        line = line.strip()

        if line.startswith('Item'):
            record = {}

        if record is None:
            continue

        if line == '':
            records.append(record)
            record = None
            continue

        # Finally, line must be data in a record, parse it
        fields = line.split(' ')
        kvps = [field.split('=', 1) for field in fields]  # 1 in split('=', 1) is for the `===FANCY==` example @MarkTolonen threw at us
        kvp_dict = dict(kvps)

        record.update(kvp_dict)
        field_names.update(kvp_dict)                      # pass in keys & vals (it's simpler) even if we only need the keys



# Deal with "straggling record" (if your input ends with a line of data (and not an empty line))
if record is not None:
    records.append(record)


with open('output.csv', 'w', newline='') as out_f:
    writer = csv.DictWriter(out_f, fieldnames=field_names)
    writer.writeheader()
    writer.writerows(records)

Here's my output.csv:

| Item | Price  | colour    | Pattern |
|------|--------|-----------|---------|
| a    | 10     | pink      |         |
| b    | 20     | blue      | checks  |
| aa   | 10     |           |         |
| bb   | 20     | blue      | checks  |
| cc   | 5      |           |         |
| dd   | 25     | blue      |         |
| ee   | 1/2=$1 | ==FANCY== |         |
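
As noted in the comments on the question, the real .dat files apparently start with a 23-line header and contain 'Associate' lines that should be skipped before parsing begins. Here is a minimal sketch of that pre-filtering, assuming the 23-line header and the '#'/'Associate' prefixes taken from the question's code (the file name is a placeholder); it would sit in front of the record-parsing loop above:

from itertools import islice

with open('yourfile.dat') as f:
    # Assumption carried over from the question's code: the first 23 lines are a header.
    for line in islice(f, 23, None):
        line = line.strip()

        # Skip comment lines and 'Associate' lines, as the original code does.
        if line.startswith('#') or line.startswith('Associate'):
            continue

        # ... hand `line` to the record-parsing logic shown above ...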
– Zach Young