I have a data frame from TCGAbiolinks and need to narrow it down to just unnormalized data. I tried writing some sort of for loop that will return the rows where the tags variable in subset.gbmexp includes "unnormalized" but can't seem to get the code right. Something along the lines of:
for (x in subset.gbmexp){
if (grep("unnormalized", x)){
data.frame(x)
}
}
Here is subset.gbmexp:
> dput(head(subset.gbmexp))
structure(list(file_state = c("submitted", "submitted", "submitted",
"submitted", "submitted", "submitted"), updated_datetime = c("2017-03-04T20:47:52.066809-06:00",
"2017-03-05T09:28:11.866514-06:00", "2017-03-05T18:29:39.863030-06:00",
"2017-03-05T09:28:11.866514-06:00", "2017-03-05T12:32:21.948139-06:00",
"2017-03-05T11:18:26.999142-06:00"), state = c("live", "live",
"live", "live", "live", "live"), data_category = c("Gene expression",
"Gene expression", "Gene expression", "Gene expression", "Gene expression",
"Gene expression"), version = c("1", "1", "1", "1", "1", "1"),
file_size = c(1513300L, 1518638L, 1518861L, 436273L, 436814L,
1500084L), data_release = c("0.0 - 25.0", "0.0 - 25.0", "0.0 - 25.0",
"0.0 - 25.0", "0.0 - 25.0", "0.0 - 25.0"), submitter_id = c(NA,
NA, NA, NA, NA, NA), access = c("open", "open", "open", "open",
"open", "open"), data_format = c("TXT", "TXT", "TXT", "TXT",
"TXT", "TXT"), id = c("c76037b3-200b-42ea-a935-7e27b94609be",
"81491634-6923-4e5e-adb3-0b05170dafaf", "8ac630d3-3b35-44f9-9076-ab2edfdb33c1",
"954c898a-b5ae-4fec-880b-890d4a9e037b", "5cd0729b-3a3a-4de0-8014-038966f5616b",
"99077392-c937-4814-9a39-02787ad04ed9"), data_type = c("Gene expression quantification",
"Gene expression quantification", "Gene expression quantification",
"Gene expression quantification", "Gene expression quantification",
"Gene expression quantification"), type = c("file", "file",
"file", "file", "file", "file"), cases = c("TCGA-06-0184-01A-01R-1849-01",
"TCGA-06-0649-01B-01R-1849-01", "TCGA-02-2485-01A-01R-1849-01",
"TCGA-28-1753-01A-01R-1850-01", "TCGA-06-0680-11A-32R-A36H-07",
"TCGA-26-5136-01B-01R-1850-01"), file_id = c("c76037b3-200b-42ea-a935-7e27b94609be",
"81491634-6923-4e5e-adb3-0b05170dafaf", "8ac630d3-3b35-44f9-9076-ab2edfdb33c1",
"954c898a-b5ae-4fec-880b-890d4a9e037b", "5cd0729b-3a3a-4de0-8014-038966f5616b",
"99077392-c937-4814-9a39-02787ad04ed9"), experimental_strategy = c("RNA-Seq",
"RNA-Seq", "RNA-Seq", "RNA-Seq", "RNA-Seq", "RNA-Seq"), md5sum = c("446a53abb4957c031d98c5c5d8b0d389",
"7856260846fba1d83842bf6c28856eaf", "8e8f5d50fa60195f3c5d1c4b6986e232",
"a1379ab262859850649051b2df076fec", "f3cd6c2c8616ac3d89e07d5281eddc49",
"5c8a99cb4bbbd83b3bb24f49b5f5cb23"), tags = list(c("unnormalized",
"gene", "v2"), c("unnormalized", "gene", "v2"), c("unnormalized",
"gene", "v2"), c("normalized", "gene", "v2"), c("gene", "normalized",
"v2"), c("unnormalized", "gene", "v2")), platform = c("Illumina HiSeq",
"Illumina HiSeq", "Illumina HiSeq", "Illumina HiSeq", "Illumina HiSeq",
"Illumina HiSeq"), state_comment = c(NA, NA, NA, NA, NA,
NA), file_name = c("unc.edu.7522ddf3-0d35-4085-9f94-1ca2e38aa804.1541218.rsem.genes.results",
"unc.edu.acb1160e-036a-4108-a9ce-d5f954191593.1538764.rsem.genes.results",
"unc.edu.102a0737-7d27-46b8-a433-4f1bb5300858.1545049.rsem.genes.results",
"unc.edu.23b23702-8e0f-4b4c-ad92-ce7ea44939e6.1544065.rsem.genes.normalized_results",
"unc.edu.94f66829-3cef-4af2-9f97-2352ac85efee.2403684.rsem.genes.normalized_results",
"unc.edu.39b5a7b5-e2ec-442d-94c4-ba938ee79b97.1542432.rsem.genes.results"
), project = c("TCGA-GBM", "TCGA-GBM", "TCGA-GBM", "TCGA-GBM",
"TCGA-GBM", "TCGA-GBM"), center_id = c("ee7a85b3-8177-5d60-a10c-51180eb9009c",
"ee7a85b3-8177-5d60-a10c-51180eb9009c", "ee7a85b3-8177-5d60-a10c-51180eb9009c",
"ee7a85b3-8177-5d60-a10c-51180eb9009c", "ee7a85b3-8177-5d60-a10c-51180eb9009c",
"ee7a85b3-8177-5d60-a10c-51180eb9009c"), center_center_type = c("CGCC",
"CGCC", "CGCC", "CGCC", "CGCC", "CGCC"), center_code = c("07",
"07", "07", "07", "07", "07"), center_name = c("University of North Carolina",
"University of North Carolina", "University of North Carolina",
"University of North Carolina", "University of North Carolina",
"University of North Carolina"), center_namespace = c("unc.edu",
"unc.edu", "unc.edu", "unc.edu", "unc.edu", "unc.edu"), center_short_name = c("UNC",
"UNC", "UNC", "UNC", "UNC", "UNC"), sample_type = c("Primary Tumor",
"Primary Tumor", "Primary Tumor", "Primary Tumor", "Solid Tissue Normal",
"Primary Tumor"), is_ffpe = c(FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE), cases.submitter_id = c("TCGA-06-0184", "TCGA-06-0649",
"TCGA-02-2485", "TCGA-28-1753", "TCGA-06-0680", "TCGA-26-5136"
), sample.submitter_id = c("TCGA-06-0184-01A", "TCGA-06-0649-01B",
"TCGA-02-2485-01A", "TCGA-28-1753-01A", "TCGA-06-0680-11A",
"TCGA-26-5136-01B")), row.names = c(NA, 6L), class = "data.frame")
However, this will only return the value in the first column of each selected row, where I want to build a data.frame including the entire row for each "unnormalized" value. Can anybody explain how I need to rewrite this? Thank you!