0

I have a script that reads data from stdin, such as:

#################################
# Retrieve NMON data from stdin #
#################################

# Read nmon data from stdin

data = sys.stdin.readlines()

Then a part of the code transforms and generates data using regex searches:

###################
# Dynamic Sections : data requires to be transposed to be exploitable within Splunk
###################

dynamic_section = ["DISKBUSY","DISKBSIZE","DISKREAD","DISKWRITE"]

for section in dynamic_section:

    # Set output file (will be opened for writing after data transposition)
    currsection_output = DATA_DIR + HOSTNAME + '_' + day + '_' + month + '_' + year + '_' + hour + minute + second + '_' + section + '.nmon.csv'

    # Open StringIO for temp in memory
    buffer = cStringIO.StringIO()

    # counter
    count = 0

    for line in data:

        if find_section:

            # csv header

            # Replace some symbols
            line=re.sub("%",'_PCT',line)
            line=re.sub(" ",'_',line)
            line=re.sub("\+",'',line)
            line=re.sub("\(",'_',line)              
            line=re.sub("\)",'_',line)              
            line=re.sub(" ",'_',line)
            line=re.sub("\.",'_',line)

            # Extract header excluding data that always has Txxxx for timestamp reference
            myregex = '(' + section + ')\,([^T].+)'
            fullheader_match = re.search( myregex, line)            

            if fullheader_match:
                fullheader = fullheader_match.group(2)

                header_match = re.match( r'([a-zA-Z\-\/\_0-9]+,)([a-zA-Z\-\/\_0-9\,]*)', fullheader)    

                if header_match:
                    header = header_match.group(2)

                    # Write header
                    buffer.write('ZZZZ' + ',' + header + '\n'),


            # Extract timestamp

            # Nmon V9 and prior do not have date in ZZZZ
            # If unavailable, we'll use the global date (AAA,date)
            ZZZZ_DATE = '-1'
            ZZZZ_TIME = '-1'                

            # For Nmon V10 and more             

            timestamp_match = re.match( r'^ZZZZ\,(.+)\,(.+)\,(.+)\n', line)
            if timestamp_match:
                ZZZZ_TIME = timestamp_match.group(2)
                ZZZZ_DATE = timestamp_match.group(3)            

                # Convert month names to numbers
                month_to_numbers = {'JAN': '01', 'FEB': '02', 'MAR': '03', 'APR': '04', 'MAY': '05', 'JUN': '06', 'JUL': '07', 'AUG': '08', 'SEP': '09', 'OCT': '10', 'NOV': '11', 'DEC': '12'}         

                for k, v in month_to_numbers.items():
                    ZZZZ_DATE = ZZZZ_DATE.replace(k, v)

                ZZZZ_timestamp = ZZZZ_DATE + ' ' + ZZZZ_TIME

            # For Nmon V9 and less                  

            if ZZZZ_DATE == '-1':
                ZZZZ_DATE = DATE
                timestamp_match = re.match( r'^ZZZZ\,(.+)\,(.+)\n', line)
                if timestamp_match:
                    ZZZZ_TIME = timestamp_match.group(2)                    

                    # Convert month names to numbers
                    month_to_numbers = {'JAN': '01', 'FEB': '02', 'MAR': '03', 'APR': '04', 'MAY': '05', 'JUN': '06', 'JUL': '07', 'AUG': '08', 'SEP': '09', 'OCT': '10', 'NOV': '11', 'DEC': '12'}         

                    for k, v in month_to_numbers.items():
                        ZZZZ_DATE = ZZZZ_DATE.replace(k, v)

                    ZZZZ_timestamp = ZZZZ_DATE + ' ' + ZZZZ_TIME

            # Extract Data
            myregex = r'^' + section + '\,(T\d+)\,(.+)\n'
            perfdata_match = re.match( myregex, line)
            if perfdata_match:
                perfdata = perfdata_match.group(2)

                # Write perf data
                buffer.write(ZZZZ_timestamp + ',' + perfdata + '\n'),


    # Open final for writing
    with open(currsection_output, "w") as currsection:

        # Rewind temp
        buffer.seek(0)

        writer = csv.writer(currsection)
        writer.writerow(['type', 'serialnum', 'hostname', 'ZZZZ', 'device', 'value'])           

        # increment
        count += 1

        for d in csv.DictReader(buffer):
            ZZZZ = d.pop('ZZZZ')
            for device, value in sorted(d.items()):

                # increment
                count += 1

                row = [section, SN, HOSTNAME, ZZZZ, device, value]
                writer.writerow(row)            

        # End for

    # Show number of lines extracted
    result = section + " section: Wrote" + " " + str(count) + " lines"
    print (result)
    ref.write(result + "\n")

    # Discard memory buffer 
    buffer.close()  

# End for

How can I avoid entering the loop (the main `for section` loop) if the section's data is not present in the content retrieved from stdin (and stored in `data`)?

Thank you for your help!

Guilmxm
  • 47
  • 1
  • 9
  • 1
    Can't you just do a simple if-statement? I.E. `if data: ... Go on with the for-loop, else: pass`. Or am I misinterpreting your question? – Perry Jul 30 '14 at 11:50
  • I think so but can i search a pattern in data without going line by line ? – Guilmxm Jul 30 '14 at 12:28
  • What do you want to do exactly? From a bit of googling and looking at your code it seems that you want to extract data from a data source (namely nmon-data) that seem to have some structure attached to it. Maybe you should check out if you can get this data in a structured format CSV/XML etc. and it'll be easier/faster to extract the things you want. However, if your goal is just to extract some arbitrary string from a text file, then yes, going "line by line" is the best option you have. – Perry Jul 30 '14 at 12:41
  • Thanks for answering. Yes it structured data but it can't be read like normal csv or other format, that's why i need to extract the data using regex stuff. What i want is to prevent from entering the for if the section is not present in the data. I already go line by line in the loop to extract the data. For example, if in data there no at least one line like 'DISKBUSY,T.+' (regex) then there is no need to enter the loop But re search can only be done over line in data no ? Python gives me error if if try to search over data and not line in data – Guilmxm Jul 30 '14 at 12:52
  • like would do a grep in shell, then a if based on return code – Guilmxm Jul 30 '14 at 12:59
  • Something like this? data=sys.stdin.read().replace('\n', '') (see: http://stackoverflow.com/questions/8369219/how-do-i-read-a-text-file-into-a-string-variable-in-python). And then simply if [dsec for dsec in dynamic_section if dsec in data].. READ DATA. else: pass? Is it something like that you're looking for? Not the most elegant thing, but the shortest line I could come up with at the moment – Perry Jul 30 '14 at 13:33
  • Perry, thanks i'm testing this, but changing stdin read from readline to read totally breaks my code and re match searches... – Guilmxm Jul 30 '14 at 20:30
  • Hi Perry, After a numerous number of tests, i could not get anything to work as expected... ! Here is my full code: http://pastebin.com/LpDt2vpg If you do have any idea how to optimize these loops, please do not hesitate, i would greatly appreciate :-) Thanks ! – Guilmxm Jul 31 '14 at 23:12
  • Perry, thank you having answered and tried to help me, i found a way to process – Guilmxm Aug 01 '14 at 19:30

1 Answers1

0

I finally found a way to proceed, by adding a counting loop before the main loop, such as:

dynamic_section = ["IOADAPT","NETERROR","NET","NETPACKET","JFSFILE","JFSINODE"]

for section in dynamic_section:

    # counter
    count = 0

    for line in data:

        # Extract sections, and write to output
        myregex = r'^' + section + '\,(T\d+)\,(.+)\n'
        find_section = re.match( myregex, line)
        if find_section:

            # increment
            count += 1

    if count > 2:

And the rest of the code.

At the end, the full piece of code:

dynamic_section = ["IOADAPT","NETERROR","NET","NETPACKET","JFSFILE","JFSINODE"]

for section in dynamic_section:

    # counter
    count = 0

    for line in data:

        # Extract sections, and write to output
        myregex = r'^' + section + '\,(T\d+)\,(.+)\n'
        find_section = re.match( myregex, line)
        if find_section:

            # increment
            count += 1

    if count > 2:

        # Set output file (will be opened for writing after data transposition)
        currsection_output = DATA_DIR + HOSTNAME + '_' + day + '_' + month + '_' + year + '_' + hour + minute + second + '_' + section + '.nmon.csv'

        # Open StringIO for temp in memory
        buffer = cStringIO.StringIO()

        # counter
        count = 0

        for line in data:

            # Extract sections, and write to output
            myregex = r'^' + section + '[0-9]*' + '|ZZZZ.+'
            find_section = re.match( myregex, line)

            if find_section:

                # csv header

                # Replace some symbols
                line=re.sub("%",'_PCT',line)
                line=re.sub(" ",'_',line)
                line=re.sub("\+",'',line)
                line=re.sub("\(",'_',line)              
                line=re.sub("\)",'_',line)              
                line=re.sub(" ",'_',line)

                # Extract header excluding data that always has Txxxx for timestamp reference
                myregex = '(' + section + ')\,([^T].+)'
                fullheader_match = re.search( myregex, line)            

                if fullheader_match:
                    fullheader = fullheader_match.group(2)

                    # Replace "." by "_" only for header
                    fullheader=re.sub("\.",'_',fullheader)

                    header_match = re.match( r'([a-zA-Z\-\/\_0-9]+,)([a-zA-Z\-\/\_0-9\,]*)', fullheader)    

                    if header_match:
                        header = header_match.group(2)

                        # Write header
                        buffer.write('ZZZZ' + ',' + header + '\n'),


                # Extract timestamp

                # Nmon V9 and prior do not have date in ZZZZ
                # If unavailable, we'll use the global date (AAA,date)
                ZZZZ_DATE = '-1'
                ZZZZ_TIME = '-1'                

                # For Nmon V10 and more             

                timestamp_match = re.match( r'^ZZZZ\,(.+)\,(.+)\,(.+)\n', line)
                if timestamp_match:
                    ZZZZ_TIME = timestamp_match.group(2)
                    ZZZZ_DATE = timestamp_match.group(3)            

                    # Convert month names to numbers
                    month_to_numbers = {'JAN': '01', 'FEB': '02', 'MAR': '03', 'APR': '04', 'MAY': '05', 'JUN': '06', 'JUL': '07', 'AUG': '08', 'SEP': '09', 'OCT': '10', 'NOV': '11', 'DEC': '12'}         

                    for k, v in month_to_numbers.items():
                        ZZZZ_DATE = ZZZZ_DATE.replace(k, v)

                    ZZZZ_timestamp = ZZZZ_DATE + ' ' + ZZZZ_TIME

                # For Nmon V9 and less                  

                if ZZZZ_DATE == '-1':
                    ZZZZ_DATE = DATE
                    timestamp_match = re.match( r'^ZZZZ\,(.+)\,(.+)\n', line)
                    if timestamp_match:
                        ZZZZ_TIME = timestamp_match.group(2)                    

                        # Convert month names to numbers
                        month_to_numbers = {'JAN': '01', 'FEB': '02', 'MAR': '03', 'APR': '04', 'MAY': '05', 'JUN': '06', 'JUL': '07', 'AUG': '08', 'SEP': '09', 'OCT': '10', 'NOV': '11', 'DEC': '12'}         

                        for k, v in month_to_numbers.items():
                            ZZZZ_DATE = ZZZZ_DATE.replace(k, v)

                        ZZZZ_timestamp = ZZZZ_DATE + ' ' + ZZZZ_TIME

                # Extract Data
                myregex = r'^' + section + '\,(T\d+)\,(.+)\n'
                perfdata_match = re.match( myregex, line)
                if perfdata_match:
                    perfdata = perfdata_match.group(2)

                    # Write perf data
                    buffer.write(ZZZZ_timestamp + ',' + perfdata + '\n'),


        # Open final for writing
        with open(currsection_output, "w") as currsection:

            # Rewind temp
            buffer.seek(0)

            writer = csv.writer(currsection)
            writer.writerow(['type', 'serialnum', 'hostname', 'ZZZZ', 'device', 'value'])           

            # increment
            count += 1

            for d in csv.DictReader(buffer):
                ZZZZ = d.pop('ZZZZ')
                for device, value in sorted(d.items()):

                    # increment
                    count += 1

                    row = [section, SN, HOSTNAME, ZZZZ, device, value]
                    writer.writerow(row)            

            # End for

        # Show number of lines extracted
        result = section + " section: Wrote" + " " + str(count) + " lines"
        print (result)
        ref.write(result + "\n")

        # Discard memory buffer 
        buffer.close()  

    # End for
Guilmxm
  • 47
  • 1
  • 9