I'm trying to scan an element in a .csv file, for example "NP_000005.3", within the .faa file, and combine the matches into the table with the relevant information.
The data:
> head(data1_pt)
X.Name Accession Start Stop Strand GeneID Locus Locus.tag Protein.product Length
1 chromosome 5 NC_000005.10 92232 182323 + 153478 PLEKHG4B - NP_443141.4 1627
2 chromosome 5 NC_000005.10 191539 195353 + 389257 LRRC14B - NP_001073947.1 514
3 chromosome 5 NC_000005.10 205297 216849 - 133957 CCDC127 - NP_660308.1 260
4 chromosome 5 NC_000005.10 218356 263443 + 6389 SDHA - XP_011512374.1 680
5 chromosome 5 NC_000005.10 218356 263443 + 6389 SDHA - XP_011512375.1 599
6 chromosome 5 NC_000005.10 218356 263443 + 6389 SDHA - XP_047273423.1 632
Protein.Name
1 pleckstrin homology domain-containing family G member 4B
2 leucine-rich repeat-containing protein 14B
3 coiled-coil domain-containing protein 127
4 succinate dehydrogenase
5 succinate dehydrogenase
6 succinate dehydrogenase
library(seqinr)
data2_asfaa <- read.fasta("GCF_000001405.40_GRCh38.p14_protein.faa")
> summary(data2_asfaa)
Length Class Mode
NP_000005.3 1474 SeqFastadna character
NP_000006.2 290 SeqFastadna character
NP_000007.1 421 SeqFastadna character
NP_000008.1 412 SeqFastadna character
NP_000009.1 655 SeqFastadna character
NP_000010.1 427 SeqFastadna character
NP_000011.2 503 SeqFastadna character
NP_000012.1 467 SeqFastadna character
NP_000013.2 363 SeqFastadna character
NP_000014.1 387 SeqFastadna character
NP_000015.2 413 SeqFastadna character
NP_000016.1 408 SeqFastadna character
NP_000017.1 484 SeqFastadna character
NP_000018.2 346 SeqFastadna character
NP_000019.2 1532 SeqFastadna character
> head(data2_asfaa)
$NP_000005.3
[1] "m" "g" "k" "n" "k" "l" "l" "h" "p" "s" "l" "v" "l" "l" "l" "l" "v" "l" "l" "p" "t" "d" "a" "s" "v" "s" "g" "k"
[29] "p" "q" "y" "m" "v" "l" "v" "p" "s" "l" "l" "h" "t" "e" "t" "t" "e" "k" "g" "c" "v" "l" "l" "s" "y" "l" "n" "e"
[57] "t" "v" "t" "v" "s" "a" "s" "l" "e" "s" "v" "r" "g" "n" "r" "s" "l" "f" "t" "d" "l" "e" "a" "e" "n" "d" "v" "l"
[85] "h" "c" "v" "a" "f" "a" "v" "p" "k" "s" "s" "s" "n" "e" "e" "v" "m" "f" "l" "t" "v" "q" "v" "k" "g" "p" "t" "q"
[113] "e" "f" "k" "k" "r" "t" "t" "v" "m" "v" "k" "n" "e" "d" "s" "l" "v" "f" "v" "q" "t" "d" "k" "s" "i" "y" "k" "p"
[141] "g" "q" "t" "v" "k" "f" "r" "v" "v" "s" "m" "d" "e" "n" "f" "h" "p" "l" "n" "e" "l" "i" "p" "l" "v" "y" "i" "q"
[169] "d" "p" "k" "g" "n" "r" "i" "a" "q" "w" "q" "s" "f" "q" "l" "e" "g" "g" "l" "k" "q" "f" "s" "f" "p" "l" "s" "s"
[ reached getOption("max.print") -- omitted 474 entries ]
attr(,"name")
[1] "NP_000005.3"
attr(,"Annot")
[1] ">NP_000005.3 alpha-2-macroglobulin isoform a precursor [Homo sapiens]"
attr(,"class")
[1] "SeqFastadna"
My solution:
thing = ""
for (i in length(data2_asfaa)){
thing <- data1_pt == data2_asfaa& #can't write a column name because its not tabular
And also:
positions <- c(which(sapply(pproducts, list(data2_asfaa), pproducts %in% data2_asfaa)))