0

I'm trying to search for each element of a .csv file (for example, "NP_000005.3") within a .faa file, and combine the matches into a table with the relevant information.

The data:

> head(data1_pt)
        X.Name    Accession  Start   Stop Strand GeneID    Locus Locus.tag Protein.product Length
1 chromosome 5 NC_000005.10  92232 182323      + 153478 PLEKHG4B         -     NP_443141.4   1627
2 chromosome 5 NC_000005.10 191539 195353      + 389257  LRRC14B         -  NP_001073947.1    514
3 chromosome 5 NC_000005.10 205297 216849      - 133957  CCDC127         -     NP_660308.1    260
4 chromosome 5 NC_000005.10 218356 263443      +   6389     SDHA         -  XP_011512374.1    680
5 chromosome 5 NC_000005.10 218356 263443      +   6389     SDHA         -  XP_011512375.1    599
6 chromosome 5 NC_000005.10 218356 263443      +   6389     SDHA         -  XP_047273423.1    632
                                              Protein.Name
1 pleckstrin homology domain-containing family G member 4B
2               leucine-rich repeat-containing protein 14B
3                coiled-coil domain-containing protein 127
4                                  succinate dehydrogenase
5                                  succinate dehydrogenase
6                                  succinate dehydrogenase

library(seqinr)
data2_asfaa <- read.fasta("GCF_000001405.40_GRCh38.p14_protein.faa")
> summary(data2_asfaa)
               Length Class       Mode     
NP_000005.3     1474  SeqFastadna character
NP_000006.2      290  SeqFastadna character
NP_000007.1      421  SeqFastadna character
NP_000008.1      412  SeqFastadna character
NP_000009.1      655  SeqFastadna character
NP_000010.1      427  SeqFastadna character
NP_000011.2      503  SeqFastadna character
NP_000012.1      467  SeqFastadna character
NP_000013.2      363  SeqFastadna character
NP_000014.1      387  SeqFastadna character
NP_000015.2      413  SeqFastadna character
NP_000016.1      408  SeqFastadna character
NP_000017.1      484  SeqFastadna character
NP_000018.2      346  SeqFastadna character
NP_000019.2     1532  SeqFastadna character


> head(data2_asfaa)
$NP_000005.3
   [1] "m" "g" "k" "n" "k" "l" "l" "h" "p" "s" "l" "v" "l" "l" "l" "l" "v" "l" "l" "p" "t" "d" "a" "s" "v" "s" "g" "k"
  [29] "p" "q" "y" "m" "v" "l" "v" "p" "s" "l" "l" "h" "t" "e" "t" "t" "e" "k" "g" "c" "v" "l" "l" "s" "y" "l" "n" "e"
  [57] "t" "v" "t" "v" "s" "a" "s" "l" "e" "s" "v" "r" "g" "n" "r" "s" "l" "f" "t" "d" "l" "e" "a" "e" "n" "d" "v" "l"
  [85] "h" "c" "v" "a" "f" "a" "v" "p" "k" "s" "s" "s" "n" "e" "e" "v" "m" "f" "l" "t" "v" "q" "v" "k" "g" "p" "t" "q"
 [113] "e" "f" "k" "k" "r" "t" "t" "v" "m" "v" "k" "n" "e" "d" "s" "l" "v" "f" "v" "q" "t" "d" "k" "s" "i" "y" "k" "p"
 [141] "g" "q" "t" "v" "k" "f" "r" "v" "v" "s" "m" "d" "e" "n" "f" "h" "p" "l" "n" "e" "l" "i" "p" "l" "v" "y" "i" "q"
 [169] "d" "p" "k" "g" "n" "r" "i" "a" "q" "w" "q" "s" "f" "q" "l" "e" "g" "g" "l" "k" "q" "f" "s" "f" "p" "l" "s" "s"
   [ reached getOption("max.print") -- omitted 474 entries ]
    attr(,"name")
    [1] "NP_000005.3"
    attr(,"Annot")
    [1] ">NP_000005.3 alpha-2-macroglobulin isoform a precursor [Homo sapiens]"
    attr(,"class")
    [1] "SeqFastadna"

My solution:

thing = ""
for (i in length(data2_asfaa)){
  thing <- data1_pt == data2_asfaa& #can't write a column name because its not tabular

And also:

positions <- c(which(sapply(pproducts, list(data2_asfaa), pproducts %in% data2_asfaa)))
Jilber Urbina
  • 58,147
  • 10
  • 114
  • 138
Ayaz
  • 1
  • 2
  • 1
    Could you please provide reproducible data? You can find instructions on how to provide reproducible data here: https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example – ssaha Nov 30 '22 at 15:18
  • https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_protein.faa.gz – Ayaz Nov 30 '22 at 15:29

0 Answers0