0

I'm getting some string data from an irrelevant source, which contains (among other text) lines that follow a pattern similar to ^INSERT INTO \w VALUES (\d, .*$. What I want to do is to sort these lines based on the first (and if second is numeric, then also the second) column's value.

The code I have so far works fine for lines that have a strict increment in the first value, but when the values are duplicated, the logic breaks and undefined is output instead. I'd also like to know how I could sort based on the second number in the event of a duplication in the first one.

Sorting by multiple properties in itself would not be an issue for me, the problem is with the way my objects are set up (they don't allow for duplicate IDs because of the use of 1 object key per ID).

var data = 
"--\n\
-- PostgreSQL database dump\n\
--\n\
\n\
SET statement_timeout = 0;\n\
SET lock_timeout = 0;\n\
SET client_encoding = 'UTF8';\n\
SET standard_conforming_strings = on;\n\
SET check_function_bodies = false;\n\
SET client_min_messages = warning;\n\
\n\
-- Working fine\n\
INSERT INTO c VALUES (3, 'c', 45);\n\
INSERT INTO c VALUES (1, 'a', 11);\n\
INSERT INTO c VALUES (5, 'e', 77);\n\
INSERT INTO c VALUES (4, 'd', 76);\n\
INSERT INTO c VALUES (2, 'b', 33);\n\
\n\
--\n\
-- Name: a; Type: TABLE; Schema: public\n\
--\n\
CREATE TABLE a (\n\
    first integer NOT NULL,\n\
    second integer NOT NULL\n\
);\n\
\n\
--\n\
-- Name: a_first_second; Type: CONSTRAINT; Schema: public\n\
--\n\
ALTER TABLE ONLY a\n\
    ADD CONSTRAINT a_first_second PRIMARY KEY (first, second);\n\
\n\
-- Breaks\n\
INSERT INTO a VALUES (1, 1);\n\
INSERT INTO a VALUES (2, 5);\n\
INSERT INTO a VALUES (4, 2);\n\
INSERT INTO a VALUES (5, 6);\n\
INSERT INTO a VALUES (2, 4);\n\
INSERT INTO a VALUES (3, 7);\n\
INSERT INTO a VALUES (4, 6);\n\
INSERT INTO a VALUES (2, 7);\n\
INSERT INTO a VALUES (6, 9);\n\
\n\
-- All good here\n\
INSERT INTO b VALUES (2, 'b', 'Description');\n\
INSERT INTO b VALUES (3, 'c', 'Description\n\
2nd line of description');\n\
INSERT INTO b VALUES (1, 'a', 'Description');\n\
INSERT INTO b VALUES (4, 'd', 'Description\n\
2nd line of description\n\
3rd line of description');\n\
INSERT INTO b VALUES (5, 'e', 'Description');";
/*/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////*/document.body.innerHTML='<code><pre></pre></code>';console = {log:function(input){document.getElementsByTagName('pre')[0].innerHTML+=input+'\n'}};
var groups = {},
    test = /INSERT INTO "?([a-z]+)"?\s*VALUES\s*\((\d+),[\s\S]+?;/g;
data.replace(test,function(row,group,field){
  if (typeof groups[group] !== 'object')
    groups[group] = {};
  groups[group][field] = row;
  return row;
});
var sortedGroupKeys = {},
    groupStep = {};
for (var j = 0, k = Object.keys(groups), l = k.length; j<l; j++){
  var group = k[j];
  sortedGroupKeys[group] = Object.keys(groups[group]).sort(function(a,b){
      return parseInt(a, 10) - parseInt(b, 10);
  });
  groupStep[group] = 0;
}
data = data.replace(test,function(row,group){
  var nextSortedKeyIndex = groupStep[group]++,
      nextSortedKey = sortedGroupKeys[group][nextSortedKeyIndex];

  return groups[group][nextSortedKey];
});
console.log(data);
SeinopSys
  • 8,787
  • 10
  • 62
  • 110
  • How many rows are there likely to be? Is it OK to have two copies of the string in memory? – Dean Taylor Dec 27 '15 at 03:41
  • @DeanTaylor Over 1000 rows and counting. The code runs on my development machine in Node.js, so memory issues are not a great concern. – SeinopSys Dec 27 '15 at 03:43
  • Is each `INSERT INTO` entire row always going to be on a single line and not broken over multiple lines? – Dean Taylor Dec 27 '15 at 03:56
  • @DeanTaylor No, sometimes when the line contains a line break in the text it'll split the `INSERT` to multiple lines. I'll edit the example to illustrate this. Lines where sorting by 2 columns is necessary don't have any extra data in them, only the numbers themselves. – SeinopSys Dec 27 '15 at 03:57

2 Answers2

1

The basic outline here is:

  • Split and iterate over the blocks of table inserts
    • Split into "insert commands rows"
    • Sort the rows
    • Join the rows
  • Join the blocks

var data = 
"-- Working fine\n\
INSERT INTO c VALUES (3, 'c', 45);\n\
INSERT INTO c VALUES (1, 'a', 11);\n\
INSERT INTO c VALUES (5, 'e', 77);\n\
INSERT INTO c VALUES (4, 'd', 76);\n\
INSERT INTO c VALUES (2, 'b', 33);\n\
\n\
-- Breaks (but not really broken anymore)\n\
INSERT INTO a VALUES (1, 1);\n\
INSERT INTO a VALUES (2, 5);\n\
INSERT INTO a VALUES (6, 11);\n\
INSERT INTO a VALUES (6, 20);\n\
INSERT INTO a VALUES (4, 2);\n\
INSERT INTO a VALUES (5, 6);\n\
INSERT INTO a VALUES (2, 4);\n\
INSERT INTO a VALUES (3, 7);\n\
INSERT INTO a VALUES (4, 6);\n\
INSERT INTO a VALUES (2, 7);\n\
INSERT INTO a VALUES (6, 9);\n\
INSERT INTO a VALUES (6, 10);\n\
\n\
-- All good here\n\
INSERT INTO b VALUES (2, 'b', 'Description');\n\
INSERT INTO b VALUES (3, 'c', 'Description\n\
2nd line of description');\n\
INSERT INTO b VALUES (1, 'a', 'Description');\n\
INSERT INTO b VALUES (4, 'd', 'Description\n\
2nd line of description\n\
3rd line of description');\n\
INSERT INTO b VALUES (5, 'e', 'Description');";
/*/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////*/document.body.innerHTML='<code><pre></pre></code>';console = {log:function(input){document.getElementsByTagName('pre')[0].innerHTML+=input+'\n'}};

var blockRegex = /((?:\bINSERT\s+INTO\s+[^\s]+\s+VALUES\s*\((?:[^"')]+|"[^"\\]*(?:\\.[^"\\]*)*"|'[^'\\]*(?:\\.[^'\\]*)*')*\);[\r\n\\]*)+)/i,
    rowsRegex = /(?:\bINSERT\s+INTO\s+[^\s]+\s+VALUES\s*\((?:[^"')]+|"[^"\\]*(?:\\.[^"\\]*)*"|'[^'\\]*(?:\\.[^'\\]*)*')*\);[\r\n\\]*)/ig,
    result = data.split(blockRegex),
    resultRows;

for (var i = 0; i < result.length; i++) {
 resultRows = result[i].match(rowsRegex);
 if (resultRows) {
   resultRows.alphanumSort();
   result[i] = resultRows.join('');
 }
}
 data = result.join('');

console.log(data);
<script src="http://www.davekoelle.com/files/alphanum.js"></script>

Note

This uses a Natural Sorting function from here: http://www.davekoelle.com/alphanum.html

Dean Taylor
  • 40,514
  • 3
  • 31
  • 50
  • Your regexes contain `[\r\n\\n]` which I suspect is a typo, as it also checks for repeated occurrences of `backslash` and `n` – SeinopSys Dec 27 '15 at 04:48
  • @DJDavid98 check again I removed it earlier - it was for testing when I was playing with escaped `\n` values. The additional `backslash` check may not be needed either - it depends on how you are doing your multi lines etc. – Dean Taylor Dec 27 '15 at 04:49
  • I've integrated this code into my Gulp task and it does not appear to modify the file at all, I'll keep on testing as to why that happens as it could be an issue on my end. – SeinopSys Dec 27 '15 at 04:57
  • For me `data.split(blockRegex)` returns an array with the entire `data` contents as the first and only element. Here's a link to [the PostgeSQL file](https://raw.githubusercontent.com/ponydevs/MLPVC-RR/master/setup/mlpvc-colorguide.pg.sql) I'm testing on in case it helps with debugging. – SeinopSys Dec 27 '15 at 05:01
  • FIX: slight problem with `[^\s+]` vs the correct `[^\s]+` (effectively only allowing single character table name). Also added support for multiple spaces between table name and `VALUES`. – Dean Taylor Dec 27 '15 at 06:05
  • Screws up when sorting rows with IDs `>=10` – SeinopSys Dec 27 '15 at 06:09
  • Yeah, you didn't specify that the numbers would go over `9` :) .... just switch out the sort for a [natural sort](http://stackoverflow.com/questions/2802341/javascript-natural-sort-of-alphanumerical-strings). – Dean Taylor Dec 27 '15 at 06:10
  • I did mention the file was over 1000 lines. While entirely possible, it's hard to image a DB full of tables with less than 10 rows that require ~1k rows of inserts. I appreciate your help, but I found a more fail safe solution on my own after some helpful pointers from Shaunt. – SeinopSys Dec 27 '15 at 06:15
  • It was a joke - updated to use a natural sorting function. – Dean Taylor Dec 27 '15 at 06:16
  • Can confirm it's working now so I'll keep the upvote in, but I'm sticking to my solution as I can probably fine-tune it easier later if I need to. – SeinopSys Dec 27 '15 at 06:22
0

Building on @Shaunt's suggestion, I changed the script to use an array of complete statements stored on a per-table basis, which I then parse separately inside a custom sort function.

var parseRow = function(r){
      var match = r.match(/VALUES \((\d+)(?:, (\d+|NULL))?[, )]/);
      if (!match)
        return [];
      return [match[1], match[2]];
    }, 
    test = /INSERT INTO "?([a-z]+)"?\s*VALUES\s*\((\d+),[\s\S]+?;/g,
    Tables = {},
    TableCounters = {};

data.replace(test,function(row,table){
  if (typeof Tables[table] !== 'object')
    Tables[table] = [];
  Tables[table].push(row);
  TableCounters[table] = 0;
  return row;
});
for (var j = 0, k = Object.keys(Tables), l = k.length; j<l; j++){
  var table = k[j];
  Tables[table].sort(function(a,b){
    a = parseRow(a);
    b = parseRow(b);

    var ix = 0;
    if (a[0] === b[0] && a[1] !== 'NULL' && b[1] !== 'NULL')
      ix++;

    a[ix] = parseInt(a[ix], 10);
    b[ix] = parseInt(b[ix], 10);

    return a[ix] > b[ix] ? 1 : (a[ix] < b[ix] ? -1 : 0);
  })
}
data = data.replace(test,function(row,table){
  var nextRowIndex = TableCounters[table]++;

  return Tables[table][nextRowIndex];
});

Working example:

var data = 
"--\n\
-- PostgreSQL database dump\n\
--\n\
\n\
SET statement_timeout = 0;\n\
SET lock_timeout = 0;\n\
SET client_encoding = 'UTF8';\n\
SET standard_conforming_strings = on;\n\
SET check_function_bodies = false;\n\
SET client_min_messages = warning;\n\
\n\
-- Working fine\n\
INSERT INTO c VALUES (3, 'c', 45);\n\
INSERT INTO c VALUES (1, 'a', 11);\n\
INSERT INTO c VALUES (5, 'e', 77);\n\
INSERT INTO c VALUES (4, 'd', 76);\n\
INSERT INTO c VALUES (2, 'b', 33);\n\
\n\
--\n\
-- Name: a; Type: TABLE; Schema: public\n\
--\n\
CREATE TABLE a (\n\
    first integer NOT NULL,\n\
    second integer NOT NULL\n\
);\n\
\n\
--\n\
-- Name: a_first_second; Type: CONSTRAINT; Schema: public\n\
--\n\
ALTER TABLE ONLY a\n\
    ADD CONSTRAINT a_first_second PRIMARY KEY (first, second);\n\
\n\
-- Breaks\n\
INSERT INTO a VALUES (1, 1);\n\
INSERT INTO a VALUES (2, 5);\n\
INSERT INTO a VALUES (4, 2);\n\
INSERT INTO a VALUES (5, 6);\n\
INSERT INTO a VALUES (2, 4);\n\
INSERT INTO a VALUES (3, 7);\n\
INSERT INTO a VALUES (4, 6);\n\
INSERT INTO a VALUES (2, 7);\n\
INSERT INTO a VALUES (6, 9);\n\
\n\
-- All good here\n\
INSERT INTO b VALUES (2, 'b', 'Description');\n\
INSERT INTO b VALUES (3, 'c', 'Description\n\
2nd line of description');\n\
INSERT INTO b VALUES (1, 'a', 'Description');\n\
INSERT INTO b VALUES (4, 'd', 'Description\n\
2nd line of description\n\
3rd line of description');\n\
INSERT INTO b VALUES (5, 'e', 'Description');";
/*//////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////*/document.body.innerHTML='<code><pre></pre></code>';console = {log:function(input){document.getElementsByTagName('pre')[0].innerHTML+=input+'\n'}};
var parseRow = function(r){
      var match = r.match(/VALUES \((\d+)(?:, (\d+|NULL))?[, )]/);
      if (!match)
        return [];
      return [match[1], match[2]];
    }, 
    test = /INSERT INTO "?([a-z]+)"?\s*VALUES\s*\((\d+),[\s\S]+?;/g,
    Tables = {},
    TableCounters = {};

data.replace(test,function(row,table){
  if (typeof Tables[table] !== 'object')
    Tables[table] = [];
  Tables[table].push(row);
  TableCounters[table] = 0;
  return row;
});
for (var j = 0, k = Object.keys(Tables), l = k.length; j<l; j++){
  var table = k[j];
  Tables[table].sort(function(a,b){
    a = parseRow(a);
    b = parseRow(b);

    var ix = 0;
    if (a[0] === b[0] && a[1] !== 'NULL' && b[1] !== 'NULL')
      ix++;

    a[ix] = parseInt(a[ix], 10);
    b[ix] = parseInt(b[ix], 10);

    return a[ix] > b[ix] ? 1 : (a[ix] < b[ix] ? -1 : 0);
  })
}
data = data.replace(test,function(row,table){
  var nextRowIndex = TableCounters[table]++;

  return Tables[table][nextRowIndex];
});
console.log(data);
SeinopSys
  • 8,787
  • 10
  • 62
  • 110