I have large tsv files (>1gb each) in a directory and I need to split each file into 80/20 split. per my limited knowledge on power shell I did below but its hell slow. I know I can do this in milliseconds with cygwin /bash but I need to automate this process through batch files. I am sure there is better and faster solution to this.
$DataSourceFolder="D:\Data"
$files = Get-ChildItem "$DataSourceFolder" -Filter *".tsv"
foreach ($file in $files)
{
$outputTrainfile="$DataSourceFolder\partitions\"+ $file.BaseName + "-train.tsv"
$outputTestfile="$DataSourceFolder\partitions\"+ $file.BaseName + "-test.tsv"
$filepath = "$DataSourceFolder\"+ $file
# Get number of rows in the file
Get-Content $filepath | Measure-Object | ForEach-Object { $sourcelinecount = $_.Count }
# Get top and tail count to be fetched from source file
$headlinecount = ($sourcelinecount * 80) /100
$taillinecount = $sourcelinecount - $headlinecount
# Create the files
New-Item -ItemType file $outputTrainfile -force
New-Item -ItemType file $outputTestfile -force
#set content to the files
Get-Content $filepath -TotalCount $headlinecount | Set-Content $outputTrainfile
Get-Content $filepath -Tail $taillinecount | Set-Content $outputTestfile
}