1

I have a very large JSON Lines file with 4.000.000 rows, and I need to convert several events from every row. The resulting CSV file contains 15.000.000 rows. How can I optimize this script?

I'm using Powershell core 7 and it takes around 50 hours to complete the conversion.

My Powershell script:

$stopwatch = [system.diagnostics.stopwatch]::StartNew()
$totalrows = 4000000

$encoding = [System.Text.Encoding]::UTF8
$i = 0
$ig = 0
# Use a growable list instead of `$output += ...`: appending to a PowerShell
# array re-allocates and copies the whole array on every add, which is O(n^2)
# per 30000-row batch and dominates the 50-hour runtime.
$output = [System.Collections.Generic.List[object]]::new()

$Importfile = "C:\file.jsonl"
$Exportfile = "C:\file.csv"

if (test-path $Exportfile) {
    Remove-Item -path $Exportfile
}

# Stream the file line by line; each line is one complete JSON document
# (JSON Lines format), so it can be parsed independently.
foreach ($line in [System.IO.File]::ReadLines($Importfile, $encoding)) {
    $json = $line | ConvertFrom-Json

    # One CSV row per event item, carrying the parent document's key fields.
    foreach ($item in $json.events.items) {
        $output.Add([pscustomobject]@{
            Key              = $json.Register.Key
            CompanyID        = $json.id
            Eventtype        = $item.type
            Eventdate        = $item.date
            Eventdescription = $item.description
        })
    }

    $i++
    $ig++
    if ($i -ge 30000) {
        # Flush the accumulated batch to disk, then reuse the list.
        $output | Export-Csv -Path $Exportfile -NoTypeInformation -Delimiter ";" -Encoding UTF8 -Append
        $i = 0
        $output.Clear()

        $minutes = $stopwatch.elapsed.TotalMinutes
        $percentage = $ig / $totalrows * 100
        $totalestimatedtime = $minutes * (100/$percentage)
        $timeremaining = $totalestimatedtime - $minutes

        Write-Host "Events: Total minutes passed: $minutes. Total minutes remaining: $timeremaining. Percentage: $percentage"
    }
}

# Write whatever remains from the final partial batch.
$output | Export-Csv -Path $Exportfile -NoTypeInformation -Delimiter ";" -Encoding UTF8 -Append
Write-Output $ig

$stopwatch.Stop()

Here is the structure of the JSON.

{
    "id": "111111111",
    "name": {
        "name": "Test Company GmbH",
        "legalForm": "GmbH"
    },
    "address": {
        "street": "Berlinstr.",
        "postalCode": "11111",
        "city": "Berlin"
    },
    "status": "liquidation",
    "events": {
        "items": [{
            "type": "Liquidation",
            "date": "2001-01-01",
            "description": "Liquidation"
        }, {
            "type": "NewCompany",
            "date": "2000-01-01",
            "description": "Neueintragung"
        }, {
            "type": "ControlChange",
            "date": "2002-01-01",
            "description": "Tested Company GmbH"
        }]
    },
    "relatedCompanies": {
        "items": [{
            "company": {
                "id": "2222222",
                "name": {
                    "name": "Test GmbH",
                    "legalForm": "GmbH"
                },
                "address": {
                    "city": "Berlin",
                    "country": "DE",
                    "formattedValue": "Berlin, Deutschland"
                },
                "status": "active"
            },
            "roles": [{
                "date": "2002-01-01",
                "name": "Komplementär",
                "type": "Komplementaer",
                "demotion": true,
                "group": "Control",
                "dir": "Source"
            }, {
                "date": "2001-01-01",
                "name": "Komplementär",
                "type": "Komplementaer",
                "group": "Control",
                "dir": "Source"
            }]
        }, {
            "company": {
                "id": "33333",
                "name": {
                    "name": "Test2 GmbH",
                    "legalForm": "GmbH"
                },
                "address": {
                    "city": "Berlin",
                    "country": "DE",
                    "formattedValue": "Berlin, Deutschland"
                },
                "status": "active"
            },
            "roles": [{
                "date": "2002-01-01",
                "name": "Komplementär",
                "type": "Komplementaer",
                "demotion": true,
                "group": "Control",
                "dir": "Source"
            }, {
                "date": "2001-01-01",
                "name": "Komplementär",
                "type": "Komplementaer",
                "group": "Control",
                "dir": "Source"
            }]
        }]
    }
}
  • Try to [avoid using the increase assignment operator (`+=`) to create a collection](https://stackoverflow.com/q/60708578/1701026) – iRon May 05 '20 at 12:29
  • I’m voting to close this question because it is more likely to get an answer on https://codereview.stackexchange.com/ – JosefZ May 05 '20 at 13:30

2 Answers

3

as per comment: Try to avoid using the increase assignment operator (+=) to create a collection.
Use the PowerShell pipeline instead, e.g.:

$stopwatch = [system.diagnostics.stopwatch]::StartNew()
$totalrows = 4000000

$encoding = [System.Text.Encoding]::UTF8
$i = 0
$ig = 0

$Importfile = "C:\file.jsonl"
$Exportfile = "C:\file.csv"

if (test-path $Exportfile) {
    Remove-Item -path $Exportfile
}

# Stream every line straight through the pipeline so Export-Csv receives each
# object as it is produced — no intermediate collection is built up in memory.
Get-Content $Importfile -Encoding $encoding | Foreach-Object {
    $json = $_ | ConvertFrom-Json
    # Fix: $json is already a parsed object at this point; iterate over the
    # event items rather than piping it through ConvertFrom-Json a second
    # time (the double conversion produced an empty output file).
    $json.events.items | Foreach-Object {
        [pscustomobject]@{
            Key = $json.Register.Key
            CompanyID = $json.id
            Eventtype = $_.type
            Eventdate = $_.date
            Eventdescription = $_.description
        }
    }

    $i++
    $ig++
    if ($i -ge 30000) {
        $i = 0
        $minutes = $stopwatch.elapsed.TotalMinutes
        $percentage = $ig / $totalrows * 100
        $totalestimatedtime = $minutes * (100/$percentage)
        $timeremaining = $totalestimatedtime - $minutes

        # Write-Host writes to the host, not the success stream, so the
        # progress message never leaks into the CSV rows.
        Write-Host "Events: Total minutes passed: $minutes. Total minutes remaining: $timeremaining. Percentage: $percentage"
    }
} | Export-Csv -Path $Exportfile -NoTypeInformation -Delimiter ";" -Encoding UTF8 -Append
Write-Output $ig

$stopwatch.Stop()

Update 2020-05-07
Based on the comments and the extra info in the question, I have written a small reusable cmdlet that uses the PowerShell pipeline to read through the .jsonl (JSON Lines) file. It collects lines until it finds a closing '}' character, then checks for a valid JSON string (using Test-Json, as there might be embedded objects). If it is valid, it immediately releases the extracted object into the pipeline and starts collecting lines again:

Function ConvertFrom-JsonLines {
    [CmdletBinding()][OutputType([Object[]])]Param (
        [Parameter(ValueFromPipeLine = $True, Mandatory = $True)][String]$Line
    )
    Begin {
        # Accumulates raw lines until they form one complete JSON document.
        $Buffer = [System.Collections.Generic.List[String]]@()
    }
    Process {
        $Buffer.Add($Line)
        # A document can only be complete once a line ends in '}'.
        If ( -not $Line.Trim().EndsWith('}') ) { Return }
        $Candidate = $Buffer -Join [Environment]::NewLine
        # Emit only when the buffered text parses as valid JSON; otherwise the
        # '}' belonged to an embedded object, so keep collecting lines.
        If ( Test-Json $Candidate -ErrorAction SilentlyContinue ) {
            $Candidate | ConvertFrom-Json
            $Buffer.Clear()
        }
    }
}

You can use it like this:

Get-Content .\file.jsonl | ConvertFrom-JsonLines | ForEach-Object { $_.events.items } |
Export-Csv -Path $Exportfile -NoTypeInformation -Encoding UTF8
iRon
  • 20,463
  • 10
  • 53
  • 79
  • What happens now is that the Exportfile is updated every row, which also takes a lot of time. Do you have a solutions to solve this too? Right know your script is only a little faster. – Martijn Olthuis May 05 '20 at 17:09
  • What the code has to do is iterate through every item in $json.events.items. That's why your code isn't working after all in my case. Do you have an answer which also solves my case? I'm don't get it to work using only pipeline. – Martijn Olthuis May 05 '20 at 17:30
  • Please explain what exactly is not working, how does the current output differ from the expected? – iRon May 05 '20 at 17:58
  • Thanks for taking the time to figure it out! It outputs an empty file at the moment, the problem lies in the $json variable, it works when not using $json. But I have to iterate over all the items inside a certain key(events) in the json, where it has to take some data out of another key and puts it. If you read my first script carefully you might understand, I don't know how to explain exactly. I was using: foreach `($item in $json.events.items)` as you see in my script. – Martijn Olthuis May 06 '20 at 08:36
  • The problem in this matter is that `$json = $line | ConvertFrom-Json` (which is also in your question) won't probably work because every (Json) object/property **structure**, *usually* expands over multiple lines (or does it actually concern a list of lines with `json` one-liners?). Can you add some parts of the Json file (the start and the part that includes the `.events.items` property) and add that to your question? – iRon May 06 '20 at 10:19
  • I added the JSON structure, maybe you can also find a solution for relatedcompanies.roles. I used an extra foreach loop in my current script, but when optimizing the script I don't get it to work. – Martijn Olthuis May 06 '20 at 11:46
  • How does this Json file end up to 4.000.000 lines? Does it repeat itself over and over again (with just different values)? If yes, is there a separator? or ... Btw the extension is `.jsonl` ([Json lines](http://jsonlines.org/examples/)?) with also suggest a non standard Json file... – iRon May 06 '20 at 15:15
  • You are right I'm using JSON Lines, I thought it was pretty much the same as JSON. On every line there is an entire JSON element, with different values. – Martijn Olthuis May 07 '20 at 10:20
  • 1
    I have updated my answer with a reusable cmdlet for JSON Lines, which is indeed not a format compatible with standard JSON. As a matter of fact, you can't use the `ConvertFrom-Json .\file.jsonl` command. – iRon May 07 '20 at 16:02
1

I am able to make it ~40 % faster by making two small changes: 1. use Get-Content -ReadCount and unpack the buffered lines, and 2. change the pipeline to 'flow' more by avoiding the `$output +=` collection-building and the inner `foreach` statement.

# Measures total runtime for the progress/ETA report below.
$stopwatch = [system.diagnostics.stopwatch]::StartNew()
$totalrows = 4000000

$encoding = [System.Text.Encoding]::UTF8
$i = 0
$ig = 0

$Importfile = "$psscriptroot\input2.jsonl"
$Exportfile = "$psscriptroot\output.csv"

# Export-Csv -Append would otherwise keep rows from a previous run.
if (Test-Path $Exportfile) {
  Remove-Item -Path $Exportfile
}
# Changed the next few lines
# -ReadCount 10000 makes Get-Content emit arrays of 10000 lines at a time,
# which greatly reduces per-object pipeline overhead on a 4M-line file.
Get-Content $Importfile -Encoding $encoding -ReadCount 10000 |
  ForEach-Object {
    # Emitting the buffered array unrolls it back into individual lines,
    # so ConvertFrom-Json still receives one JSON document per pipeline item.
    $_
  } | ConvertFrom-Json | ForEach-Object {
    # $_ is now one parsed JSON document (one input line); keep a reference
    # so the nested ForEach-Object can still reach the parent's properties.
    $json = $_
    # Emit one CSV row per event item, carrying the parent document's keys.
    $json.events.items | ForEach-Object {
      [pscustomobject]@{
        Key              = $json.Register.Key
        CompanyID        = $json.id
        Eventtype        = $_.type
        Eventdate        = $_.date
        Eventdescription = $_.description
      }
    }

    $i++
    $ig++
    # Progress/ETA report every 10000 processed documents.
    if ($i -ge 10000) {
      $i = 0
      $minutes = $stopwatch.elapsed.TotalMinutes
      $percentage = $ig / $totalrows * 100
      $totalestimatedtime = $minutes * (100 / $percentage)
      $timeremaining = $totalestimatedtime - $minutes

      # Write-Host targets the console only, so the message does not pollute
      # the objects flowing on to Export-Csv.
      Write-Host "Events: Total minutes passed: $minutes. Total minutes remaining: $timeremaining. Percentage: $percentage"
    }
  } | Export-Csv -Path $Exportfile -NoTypeInformation -Delimiter ';' -Encoding UTF8 -Append
Write-Output $ig

$stopwatch.Stop()
perost
  • 11
  • 1