BEGIN {
split(w, weight)
total = 0
for (i in weight) {
weight[i] += total
total = weight[i]
}
}
FNR == 1 {
if (NR!=1) {
write_partitioned_files(weight,a)
split("",a,":") #empty a portably
}
name=FILENAME
}
{a[FNR]=$0}
END {
write_partitioned_files(weight,a)
}
function write_partitioned_files(weight, a) {
split("",threshold,":")
size = length(a)
for (i in weight){
threshold[length(threshold)] = int((size * weight[i] / total)+0.5)+1
}
l=1
part=0
for (i in threshold) {
close(out)
out = name ".part" ++part
for (;l<threshold[i];l++) {
print a[l] " > " out
}
}
}
Invoke as:
awk -v w="60 20 20" -f above_script.awk file_to_split1 file_to_split2 ...
Replace " > "
with >
in script to actually write partitioned files.
The variable w
expects space separated numbers. Files are partitioned in that proportion. For example "2 1 1 3"
will partition files into four with number of lines in proportion of 2:1:1:3. Any sequence of numbers adding up to 100 can be used as percentages.
For large files the array a
may consume too much memory. If that is an issue, here is an alternative awk
script:
BEGIN {
split(w, weight)
for (i in weight) {
total += weight[i]; weight[i] = total #cumulative sum
}
}
FNR == 1 {
#get number of lines. take care of single quotes in filename.
name = gensub("'", "'\"'\"'", "g", FILENAME)
"wc -l '" name "'" | getline size
split("", threshold, ":")
for (i in weight){
threshold[length(threshold)+1] = int((size * weight[i] / total)+0.5)+1
}
part=1; close(out); out = FILENAME ".part" part
}
{
if(FNR>=threshold[part]) {
close(out); out = FILENAME ".part" ++part
}
print $0 " > " out
}
This passes through each file twice. Once for counting lines (via wc -l
) and the other time while writing partitioned files. Invocation and effect is similar to the first method.