1

I have a csv with 4 columns: type, state, year, value

There are multiple rows that share both the same type and state. I want to filter the csv so I only get one row for each type/state combination, where that row has the most recent year. I'm wondering how I can do this programmatically, as my data file will eventually be much, much longer. I'd like to do this with javascript if possible.

Here is my data:

type,state,year,value
    oranges,florida,1990,18.17
    oranges,florida,1980,14.52
    oranges,florida,2000,11.1
    oranges,florida,2010,9.8
    lemons,florida,1990,8.2
    lemons,florida,1980,6.2
    lemons,florida,1985,4.1
    lemons,florida,1987,5.87
    lemons,florida,2002,5.87
    lemons,florida,2003,460
    lemons,florida,2009,730
    lemons,florida,2010,1100
    lemons,florida,2011,1200
    lemons,florida,2012,1200
    lemons,florida,2013,55.5
    lemons,florida,2014,55.1
    lemons,florida,2015,53.1
    limes,florida,1991,49.9
    limes,florida,1992,45.6
    limes,florida,1993,41.8
    limes,florida,2002,3.23
    limes,florida,2003,3.23
    limes,florida,2009,10.767
    limes,florida,2011,34.34492
    oranges,california,1964,49.9
    oranges,california,1965,0.293
    oranges,california,1970,0.293
    oranges,california,1990,16
    oranges,california,1980,16
    oranges,california,1985,29.8
    oranges,california,1987,6.3
    oranges,california,2002,6.31
    oranges,california,2003,6.27
    oranges,california,2008,6.24
    oranges,california,2009,6.38
    oranges,california,2010,10.3
    oranges,california,2011,10.3
    oranges,california,2012,10.4
    oranges,california,2013,8.5
    oranges,california,2014,12.6
    oranges,california,2015,75.1
    lemons,california,1950,74.8
    lemons,california,1955,74.5
    lemons,california,1960,74.2
    lemons,california,1965,72.9
    lemons,california,1970,72.7
    limes,california,1990,72.4
    limes,california,1991,72.1
    limes,california,1992,102.56187
    limes,california,1993,102.25079
    limes,california,1994,96.70884
    limes,california,1995,88

I would want my results to be:

type,state,year,value
lemons,california,1970,72.7
limes,california,1995,88
oranges,california,2015,75.1
lemons,florida,2015,53.1
limes,florida,2011,34.34492
oranges,florida,2010,9.8
sprucegoose
  • 486
  • 9
  • 25

3 Answers3

0
  1. Create empty array
  2. Take csv line by line, in loop
  3. Split by comma to get fruit state and year
  4. If combination of fruit and state doesn't exists in array add to arrray
  5. Else check year, if more recent, replace older in array with new values

[EDIT] You can loop directly on array translated from csv:

Code example:

    var items = [];
    items['florida_oranges'] = 1990;

    //data taken from array list
    var newYear = 1994;
    var state = 'florida';
    var fruit = 'oranges';

    //checking
    var index = state + "_" + fruit;
    if(items[index] && items[index] < newYear) items[index] = newYear;
Community
  • 1
  • 1
pmaniora
  • 76
  • 5
  • How do I check if fruit and state don't exist? Would you be able to provide a code example? I'm not sure how to go about this. – sprucegoose Dec 13 '15 at 06:11
0

Here is an example of how it can be done in nodejs. I have kept the example small and simple...

What you need to run:

  • nodejs installed with
  • fast-csv module installed (npm install fast-csv)

  • Input file assumed (data.csv)

  • Output file will be (stats.csv)

//Code snippet

var fs = require('fs');

var csv = require('fast-csv');

var stats = [];

csv
 .fromPath("data.csv")
 .on("data", function(data){
        addOrUpdate(data);
 })
 .on("end", function(){
     console.log(stats);
     writeStats(stats);
 });

function addOrUpdate(item) {
        var found = false;
        for ( var i=0; i<stats.length; i++ ) {
                if ( stats[i][0] === item[0] && stats[i][1] === item[1] ) {
                        found = true;
                        if  (stats[i][3] < item[3]) {
                                stats[i][2] = item[2];
                                stats[i][3] = item[3];
                        }
                        break;
                }
        }
        if ( false == found) {
                stats.push(item);
        }
}

function writeStats(stats) {
  var csvStream =
    csv
     .createWriteStream({headers: false}),
        writableStream = fs.createWriteStream("stats.csv");
   csvStream.pipe(writableStream);
   for ( i=0; i<stats.length; i++ ) {
      csvStream.write(stats[i]);
   }
   csvStream.end();

}
Jas
  • 336
  • 2
  • 7
0

You could do this with this PHP code:

<?php
    $csvData = array_map('str_getcsv', file("input.csv")); // put csv file in an array
    $csvHeader = array_shift($csvData);    // chop off first element (header) and save in array
    $testArray = array();   // create array

    // iterate over rows in csvData
    for ($i = 0; $i < count($csvData); $i++) {
        // if state and type are NOT in array or, year selected row is greater than the one in array 
        if (!in_array($csvData[$i][0] . '_' . $csvData[$i][1], $testArray) || $testArray[$csvData[$i][0] . '_' . $csvData[$i][1]]['year'] > $csvData[$i][2]) {
            // build multidimensional array
            $testArray[$csvData[$i][0] . '_' . $csvData[$i][1]] = array(
                'type' => $csvData[$i][0],
                'state' => $csvData[$i][1],
                'year' => $csvData[$i][2],
                'value' => $csvData[$i][3],
            );

        }

    }

    array_unshift($testArray, $csvHeader);  // put header back
    $fp = fopen('output.csv', 'w'); // open file

    // fill csv file
    foreach($testArray as $fields) {
        fputcsv($fp, $fields);
    }

    fclose($fp);    // close file
?>
bjb568
  • 11,089
  • 11
  • 50
  • 71