Comparing two files based on multiple field using awk ( or may be python)

Question

I want to compare two files and display the differences and the missing records in both files. Based on suggestions on this forum, I found awk is the fastest way to do it.

Comparison is to be done based on composite key - match_key and issuer_grid_id

Code:

BEGIN { FS="[= ]" }
{
    match(" "$0,/ match_key="[^"]+"/)
    key = substr($0,RSTART,RLENGTH)
}
NR==FNR {
    file1[key] = $0
    next
}
{
    if ( key in file1 ) {
        nf = split(file1[key],tmp)
        for (i=1; i<nf; i+=2) {
            f1[key,tmp[i]] = tmp[i+1]
        }

        msg = sep = ""
        for (i=1; i<NF; i+=2) {
            if ( $(i+1) != f1[key,$i] ) {
                msg = msg sep OFS ARGV[1] "." $i "=" f1[key,$i] OFS FILENAME "." $i "=" $(i+1)
                sep = ","
            }
        }
        if ( msg != "" ) {
            print "Mismatch in row " FNR msg
        }
        delete file1[key]
    }
    else {
        file2[key] = $0
    }
}
END {
    for (key in file1) {
        print "In file1 only:", key, file1[key]
    }
    for (key in file2) {
        print "In file2 only:", key, file2[key]
    }
}

file1:

period="2021-02-28" book_base_ent_cd="U0028" intra_group_acc_scope="Issuer is not part of the reporting group" frbrnc_stts="Not forborne or renegotiated" src_prfrmng_stts="KC10.1" dflt_stts_issr="Not in default" src_dflt_stts_issr="KC10.1" dflt_stts_instrmnt="Not in default" src_mes_accntng_clssfctn="AMC" prdntl_prtfl="Non-trading book" imprmnt_stts="Stage 1 (IFRS)" src_imprmnt_stts="1" imprmnt_assssmnt_mthd="Collectively assessed" unit_measure="USD" issuer_grid_id="2" match_key="PLCHS252SA20"
period="2021-02-28" book_base_ent_cd="U0027" intra_group_acc_scope="Issuer is not part of the reporting group" frbrnc_stts="Not forborne or renegotiated" src_prfrmng_stts="KC10.1" dflt_stts_issr="Not in default" src_dflt_stts_issr="KC10.1" dflt_stts_instrmnt="Not in default" src_mes_accntng_clssfctn="AMC" prdntl_prtfl="Non-trading book" imprmnt_stts="Stage 1 (IFRS)" src_imprmnt_stts="1" imprmnt_assssmnt_mthd="Collectively assessed" unit_measure="EUR" issuer_grid_id="3" match_key="PLCHS252SA20"
period="2021-02-28" book_base_ent_cd="U0027" intra_group_acc_scope="Issuer is not part of the reporting group" frbrnc_stts="Not forborne or renegotiated" src_prfrmng_stts="KC10.1" dflt_stts_issr="Not in default" src_dflt_stts_issr="KC10.1" dflt_stts_instrmnt="Not in default" src_mes_accntng_clssfctn="AMC" prdntl_prtfl="Non-trading book" imprmnt_stts="Stage 1 (IFRS)" src_imprmnt_stts="1" imprmnt_assssmnt_mthd="Collectively assessed" unit_measure="EUR" issuer_grid_id="2" match_key="PLCHS252SA22"
period="2021-02-28" book_base_ent_cd="U0027" intra_group_acc_scope="Issuer is not part of the reporting group" frbrnc_stts="Not forborne or renegotiated" src_prfrmng_stts="KC10.1" dflt_stts_issr="Not in default" src_dflt_stts_issr="KC10.1" dflt_stts_instrmnt="Not in default" src_mes_accntng_clssfctn="AMC" prdntl_prtfl="Non-trading book" imprmnt_stts="Stage 1 (IFRS)" src_imprmnt_stts="1" imprmnt_assssmnt_mthd="Collectively assessed" unit_measure="EUR" issuer_grid_id="2" match_key="PLCHS252SA21"

file2:

period="2021-02-28" book_base_ent_cd="U0027" intra_group_acc_scope="Issuer is not part of the reporting group" frbrnc_stts="Not forborne or renegotiated" src_prfrmng_stts="KC10.1" dflt_stts_issr="Not in default" src_dflt_stts_issr="KC10.1" dflt_stts_instrmnt="Not in default" src_mes_accntng_clssfctn="AMC" prdntl_prtfl="Non-trading book" imprmnt_stts="Stage 1 (IFRS)" src_imprmnt_stts="1" imprmnt_assssmnt_mthd="Collectively assessed" unit_measure="EUR" issuer_grid_id="3" match_key="PLCHS252SA20"
period="2021-02-28" book_base_ent_cd="U0027" intra_group_acc_scope="Issuer is not part of the reporting group" frbrnc_stts="Not forborne or renegotiated" src_prfrmng_stts="KC10.1" dflt_stts_issr="Not in default" src_dflt_stts_issr="KC10.1" dflt_stts_instrmnt="Not in default" src_mes_accntng_clssfctn="AMC" prdntl_prtfl="Non-trading book" imprmnt_stts="Stage 1 (IFRS)" src_imprmnt_stts="1" imprmnt_assssmnt_mthd="Collectively assessed" unit_measure="EUR" issuer_grid_id="2" match_key="PLCHS252SA20"
period="2021-02-28" book_base_ent_cd="U0027" intra_group_acc_scope="Issuer is not part of the reporting group" frbrnc_stts="Not forborne or renegotiated" src_prfrmng_stts="KC10.1" dflt_stts_issr="Not in default" src_dflt_stts_issr="KC10.1" dflt_stts_instrmnt="Not in default" src_mes_accntng_clssfctn="AMC" prdntl_prtfl="Non-trading book" imprmnt_stts="Stage 1 (IFRS)" src_imprmnt_stts="1" imprmnt_assssmnt_mthd="Collectively assessed" unit_measure="EUR" issuer_grid_id="2" match_key="PLCHS252SA23"
period="2021-02-28" book_base_ent_cd="U0027" intra_group_acc_scope="Issuer is not part of the reporting group" frbrnc_stts="Not forborne or renegotiated" src_prfrmng_stts="KC10.1" dflt_stts_issr="Not in default" src_dflt_stts_issr="KC10.1" dflt_stts_instrmnt="Not in default" src_mes_accntng_clssfctn="AMC" prdntl_prtfl="Non-trading book" imprmnt_stts="Stage 1 (IFRS)" src_imprmnt_stts="1" imprmnt_assssmnt_mthd="Collectively assessed" unit_measure="EUR" issuer_grid_id="2" match_key="PLCHS252SA21"

file 3 (it has only one row but number of fields are more)

period="2021-02-28" book_base_ent_cd="U0027" other_inst_ident="PLCHS258Q463" rep_nom_curr="PLN" reporting_basis="Unit" src_instr_class="Debt" mat_date="2026-08-25" nom_curr="PLN" primary_asset_class="Bond" seniority_type="931" security_status="alive" issuer_name="CUST38677608" intra_group_prud_scope="Issuer is not part of the reporting group" intra_group_acc_scope="Issuer is not part of the reporting group" frbrnc_stts="Not forborne or renegotiated" src_frbrnc_stts="NOFRBRNRNGT" prfrmng_stts="Performing" src_prfrmng_stts="KC10.1" dflt_stts_issr="Not in default" src_dflt_stts_issr="KC10.1" dflt_stts_instrmnt="Not in default" src_mes_accntng_clssfctn="AMC" prdntl_prtfl="Non-trading book" imprmnt_stts="Stage 1 (IFRS)" src_imprmnt_stts="1" imprmnt_assssmnt_mthd="Collectively assessed" src_imprmnt_assssmnt_mthd="COLLECTIVE" accmltd_imprmnt="78.54" accmltd_chngs_fv_cr="0" expsr_vl="0" unit_measure="EUR" unit_measure_nv="EUR" crryng_amnt="24565.13" issuer_grid_id="38677608" match_key="PLCHS258Q463"

Expected output:

In file1 only : issuer_grid_id="2" match_key="PLCHS252SA22"
In file2 only : issuer_grid_id="2" match_key="PLCHS252SA23"

Mismatch for issuer_grid_id="2" match_key="PLCHS252SA20" : file1.book_base_ent_cd="U0028" file2.book_base_ent_cd="U0027", file1.unit_measure="USD" file2.unit_measure="EUR"

Actual Output

awk -f compare.awk file1 file2
Mismatch in row 1 for    file1.issuer_grid_id="2" file2.issuer_grid_id="3", file1.match_key="PLCHS252SA21" file2.match_key="PLCHS252SA20"
In file2 only:  period="2021-02-28" book_base_ent_cd="U0027" intra_group_acc_scope="Issuer is not part of the reporting group" frbrnc_stts="Not forborne or renegotiated" src_prfrmng_stts="KC10.1" dflt_stts_issr="Not in default" src_dflt_stts_issr="KC10.1" dflt_stts_instrmnt="Not in default" src_mes_accntng_clssfctn="AMC" prdntl_prtfl="Non-trading book" imprmnt_stts="Stage 1 (IFRS)" src_imprmnt_stts="1" imprmnt_assssmnt_mthd="Collectively assessed" unit_measure="EUR" issuer_grid_id="2" match_key="PLCHS252SA21"

I am not able to find a way to do the multifield comparison?

Any suggestion is appreciated. I tagged python too, if any way to do it in faster way in it.

Best Regards.

score 2 · Accepted Answer · answered Feb 03 '22 at 13:48

Just tweak the setting of key at the top to use whatever set of fields you want, and the printing of the mismatch message to be from key ... key instead of from line ... FNR:

$ cat tst.awk
BEGIN { FS="[= ]" }
{
    match(" "$0,/ issuer_grid_id="[^"]+"/)
    key = substr($0,RSTART,RLENGTH)
    match(" "$0,/ match_key="[^"]+"/)
    key = key substr($0,RSTART,RLENGTH)
}
NR==FNR {
    file1[key] = $0
    next
}
{
    if ( key in file1 ) {
        nf = split(file1[key],tmp)
        for (i=1; i<nf; i+=2) {
            f1[key,tmp[i]] = tmp[i+1]
        }

        msg = sep = ""
        for (i=1; i<NF; i+=2) {
            if ( $(i+1) != f1[key,$i] ) {
                msg = msg sep OFS ARGV[1] "." $i "=" f1[key,$i] OFS FILENAME "." $i "=" $(i+1)
                sep = ","
            }
        }
        if ( msg != "" ) {
            print "Mismatch for key " key msg
        }
        delete file1[key]
    }
    else {
        file2[key] = $0
    }
}
END {
    for (key in file1) {
        print "In file1 only:", key, file1[key]
    }
    for (key in file2) {
        print "In file2 only:", key, file2[key]
    }
}

$ awk -f tst.awk file1 file2
Mismatch for key issuer_grid_id="2" match_key="PLCHS252SA20" file1.book_base_ent_cd="U0028" file2.book_base_ent_cd="U0027", file1.unit_measure="USD" file2.unit_measure="EUR"
In file1 only: issuer_grid_id="2" match_key="PLCHS252SA22" period="2021-02-28" book_base_ent_cd="U0027" intra_group_acc_scope="Issuer is not part of the reporting group" frbrnc_stts="Not forborne or renegotiated" src_prfrmng_stts="KC10.1" dflt_stts_issr="Not in default" src_dflt_stts_issr="KC10.1" dflt_stts_instrmnt="Not in default" src_mes_accntng_clssfctn="AMC" prdntl_prtfl="Non-trading book" imprmnt_stts="Stage 1 (IFRS)" src_imprmnt_stts="1" imprmnt_assssmnt_mthd="Collectively assessed" unit_measure="EUR" issuer_grid_id="2" match_key="PLCHS252SA22"
In file2 only: issuer_grid_id="2" match_key="PLCHS252SA23" period="2021-02-28" book_base_ent_cd="U0027" intra_group_acc_scope="Issuer is not part of the reporting group" frbrnc_stts="Not forborne or renegotiated" src_prfrmng_stts="KC10.1" dflt_stts_issr="Not in default" src_dflt_stts_issr="KC10.1" dflt_stts_instrmnt="Not in default" src_mes_accntng_clssfctn="AMC" prdntl_prtfl="Non-trading book" imprmnt_stts="Stage 1 (IFRS)" src_imprmnt_stts="1" imprmnt_assssmnt_mthd="Collectively assessed" unit_measure="EUR" issuer_grid_id="2" match_key="PLCHS252SA23"

Thanks Ed. Does it matter if number of fields in a row increases in a file? I just changed the input to the script ( which can be seen in file 3). I copied content of file 3 to file1 and file2. So basically both the files are same now, and unix diff command confirms it. But awk comparison shows a difference. Can you please try at your end please? The weird thing is it takes a value and shows that as field in output. I think somewhere in script we need to take care of space in the input field values. — Neha, Feb 03 '22 at 14:53
The number of fields doesn't matter but now you're introducing spaces into the quoted fields and we're using spaces and `=` as the field separator in the code so as I said at the top of my first answer, [assuming that, like in the example you provided, your quoted strings don't contain blanks, =, or "](https://stackoverflow.com/a/70681672/1745001), i.e. the code isn't designed to handle that. Ask a new question about how to handle spaces and/or `=` and/or `"` in quoted fields as opposed to this one about how to handle multi-field keys. — Ed Morton, Feb 03 '22 at 15:03
Sure I'll ask another one. But for my understanding how is this input differs from original input ? In that also we have fields like intra_group_acc_scope, frbrnc_stts, dflt_stts_issr etc which have spaces inside quoted strings. — Neha, Feb 03 '22 at 15:04
By "original input" if you mean your previous questions that the code in your question was designed to handle, that input had no fields with spaces. If you mean the input in THIS question, your input's just too lengthy to notice changes like that, you'd have to give a heads up to look for it. Good sample input/output is concise and doesn't require a scroll bar to see for just that reason. So your file3 input isn't different from your file1 and file2 input in this question, but your question is about how to modify existing code to handle multi-field keys, not how to handle fields with spaces. — Ed Morton, Feb 03 '22 at 15:09

dawg · Answer 2 · 2022-02-04T13:07:22.597

You can use ruby sets:

$ cat tst.rb
def f2h(fn) 
    data={}
    File.open(fn){|fh| fh.
        each_line{|line| h=line.scan(/(\w+)="([^"]+)"/).to_h
            k=h.slice("issuer_grid_id", "match_key").
            map{|k,v| "#{k}=#{v}"}.join(", ")
            data[k]=h}
    }
    data
end

f1=f2h(ARGV[0])
f2=f2h(ARGV[1])

mis=Hash.new { |hash, key| hash[key] = [] }
(f2.keys & f1.keys).each{|k| 
    f1[k].each{|ks,v| 
        template="#{ks}: #{ARGV[0]}.#{f1[k][ks]}, #{ARGV[1]}.#{f2[k][ks]}"
        mis[k] << template if f1[k][ks]!=f2[k][ks]}} 

mis.each{|k,v| puts "Mismatch for key #{k} #{v.join(" ")}"}

f1only=(f1.keys-f2.keys).join(", ") 
f2only=(f2.keys-f1.keys).join(", ")  
puts "Only in #{ARGV[0]}: #{f1only}\nOnly in #{ARGV[1]}: #{f2only}"

Then calling like so:

ruby tst.rb  file1 file2

Prints:

Mismatch for key issuer_grid_id=2, match_key=PLCHS252SA20 book_base_ent_cd: file1.U0028, file2.U0027 unit_measure: file1.USD, file2.EUR
Only in file1: issuer_grid_id=2, match_key=PLCHS252SA22
Only in file2: issuer_grid_id=2, match_key=PLCHS252SA23

(If you want quotes around the values, they are easily added.)

It works because ruby support set arithmetic on arrays (this is from the ruby interactive shell):

irb(main):033:0> arr1=[1,2,3,4]
=> [1, 2, 3, 4]
irb(main):034:0> arr2=[2,3,4,5]
=> [2, 3, 4, 5]
irb(main):035:0> arr1-arr2
=> [1]       # only in arr1
irb(main):036:0> arr2-arr1
=> [5]       # only in arr2
irb(main):037:0> arr1 & arr2
=> [2, 3, 4] # common between arr1 and arr2

Since we are using (f2.keys & f1.keys) we are guaranteed to only be looping over shared keys. It therefore works just fine with your example file3:

$ ruby tst.rb  file1 file3
Only in file1: issuer_grid_id=2, match_key=PLCHS252SA20, issuer_grid_id=3, match_key=PLCHS252SA20, issuer_grid_id=2, match_key=PLCHS252SA22, issuer_grid_id=2, match_key=PLCHS252SA21
Only in file3: issuer_grid_id=38677608, match_key=PLCHS258Q463

Since Python also has sets, this is easily written in Python too:

import re 

def f2h(fn):
    di={}
    k1, k2="issuer_grid_id", "match_key"
    with open(fn) as f:
        for line in f:
            matches=dict(re.findall(r'(\w+)="([^"]+)"', line))
            di[f"{k1}={matches[k1]}, {k2}={matches[k2]}"]=matches
    return di

f1=f2h(fn1)
f2=f2h(fn2)

mis={}
for k in set(f1.keys()) & set(f2.keys()):
    for ks,v in f1[k].items():
        if f1[k][ks]!=f2[k][ks]:
            mis.setdefault(k, []).append(
               f"{ks}: {fn1}.{f1[k][ks]}, {fn2}.{f2[k][ks]}")
            
for k,v in mis.items():
    print(f"Mismatch for key {k} {' '.join(v)}")
    
print(f"Only in {fn1}: {';'.join(set(f1.keys())-f2.keys())}")
print(f"Only in {fn2}: {';'.join(set(f2.keys())-f1.keys())}")

While awk does not support sets, the set operations and and minus are trivial to write with associative arrays. Which then allows a GNU awk version of this same method:

function set_and(a1, a2, a3) {
    delete a3
    for (e in a1) if (e in a2) a3[e]
}

function set_minus(a1, a2, a3) {
    delete a3
    for (e in a1) if (!(e in a2)) a3[e]
}

function proc_line(s, data) {
    delete data
    # this is the only GNU specific portion and easily rewritten for POSIX
    patsplit(s,matches,/\w+="[^"]+"/)
    for (m in matches) {
        split(matches[m],kv, /=/)
        data[kv[1]]=kv[2]
    }   
}

{
    proc_line($0, data) 
    key=sprintf("issuer_grid_id=%s, match_key=%s", 
        data["issuer_grid_id"], data["match_key"])
}

FNR==NR{a1[key]=$0}
FNR<NR{a2[key]=$0}

END{
    set_and(a1,a2, a3) 
    for (key in a3) {
        ft=sprintf("Mismatch for key %s ", key)
        proc_line(a1[key],d1)
        proc_line(a2[key],d2)
        for (sk in d1) if (d1[sk]!=d2[sk]) {
                printf("%s %s %s.%s; %s.%s", ft, sk, ARGV[1], d1[sk], ARGV[2], d2[sk])
                ft=""
                }
        if (ft=="") print ""
    }

    set_minus(a1,a2, a3)
    for (e in a3) printf("In %s only: %s\n", ARGV[1], e)

    set_minus(a2,a1, a3)
    for (e in a3) printf("In %s only: %s\n", ARGV[2], e)
}

This works the same as the Ruby and Python version and also supports the third file example.

Good luck!

Comparing two files based on multiple field using awk ( or may be python)

2 Answers2