-1

I have a big data file (not csv) with many columns with a header row. The column headers are strings containing letters and numbers. I would like to write a script that extracts the data columns based on their header, if the header is present in a second file. I have researched this question, and wrote a script adapted from an answer found at AWK extract columns from file based on header selected from 2nd file. I understand a good part of what it does, but I'll admit that I don't understand it completely. I am aware that it was designed for a csv file... I tried using it with my files, but I cannot get it to work. Here is the code (contained in a bash script):

(note: $motif_list and $affinity_matrix are the paths to both files and have been previously defined in the bash script)

 43 awk -v motif_list="$motif_list" -v affinity_matrix="$affinity_matrix" '
 44         BEGIN {
 45                 j=1
 46                 while ((getline < motif_list) > 0)
 47                 {
 48                         col_names[j++] = $1
 49                 }
 50                 n=j-1;
 51                 close(motif_list)
 52                 for (i=1; i<=n; i++) s[col_names[i]] = i
 53               }       
 54 
 55         NR==1 {
 56                 for (f=1; f<=NF; f++)
 57                         if ($f in s) c[s[$f]]=f
 58                 next
 59               }
 60 
 61         {
 62                 sep=" "
 63                 for (f=1; f<=n; f++)
 64                         {
 65                                 printf("%c%s",sep,$c[f])
 66                                 sep=FS
 67                         }
 68                 print " "
 69         }' "$affinity_matrix" > $affinity_columns

(I also changed the separator from "" to " ", but that might not be the right way to do it)

As an example, here are sample input and output tables:

Input:

A   B   C   D   E   F
1   2   3   4   5   6
1   2   3   4   5   6
1   2   3   4   5   6
1   2   3   4   5   6
1   2   3   4   5   6

Output:

A   C   
1   3   
1   3   
1   3   
1   3   
1   3   

Any input would be much appreciated!

Thanks

Community
  • 1
  • 1
arielle
  • 915
  • 1
  • 12
  • 29

1 Answers1

1

The general approach (untested since you didn't provide any sample input/output) is:

awk '
NR==FNR { names[$0]; next }
FNR==1 {
    for (i=1;i<=NF;i++) {
        if ($i in names) {
            nrs[i]
        }
    }
}
{
    c = 0
    for (i=1;i<=NF;i++) {
        if (i in nrs) {
            printf "%s%s", (c++ ? OFS : ""), $i
        }
    }
    if (c) {
        print ""
    }
}
' motif_list affinity_matrix
Ed Morton
  • 188,023
  • 17
  • 78
  • 185