I ordered a list of genes by P-value in the form of a top.genes.matrix
expression matrix. The dat.filtered
dataframe is in log2 scale, which is filtered from the original dat
dataframe.
Most of my top genes have infinite values. Is there an error in my code? How should I treat the infinite values? Remove it?
# Convert dat to numeric matrix
samp.matrix <- data.matrix(dat[, (3:ncol(dat))])
# Correlation Matrix
dat.cor <- cor(samp.matrix, method="pearson", use="pairwise.complete.obs")
# Average correlation plot
dat.avg <- apply(dat.cor, 1, mean, na.rm=T)
# Remove Outlier(s)
samp.matrix <- samp.matrix[, -(grep(names(outlier), colnames(samp.matrix)))]
# Eliminate probes with rowMeans less than 0 on a log2 scale
dat.fil <- subset(samp.matrix, log2(rowMeans(samp.matrix)) > 0)
removed <- nrow(samp.matrix) - nrow(dat.fil)
# Eliminate probes with rowMeans less than 3 on a log2 scale
dat.filtered <- subset(dat.fil, rowMeans(dat.fil) > 3)
dat.filtered <- as.data.frame(dat.filtered)
removed <- nrow(dat.fil) - nrow(dat.filtered)
# Student's t-test function
rawp <- c()
t.test.all.genes <- function(x,s1,s2) {
x <- as.numeric(x)
x1 <- x[s1]
x2 <- x[s2]
x1 <- as.numeric(x1)
x2 <- as.numeric(x2)
t.out <- t.test(x1,x2, alternative="two.sided",var.equal=T)
out <- as.numeric(t.out$p.value)
return(out)
}
# Raw p-value for ontrol and glioma samples
rawp <- apply(dat.filtered, 1, t.test.all.genes, s1=is.ctl, s2=is.glioma)
# Ordering the highest genes (by P-value) in the form of a data.frame
# Note: dat.filtered is still in log2 scale
best.genes <- order(rawp)[1:length(rawp)]
best.genes.df <- data.frame(index=best.genes, exp=2^dat.filtered[best.genes, ], rawp=rawp[best.genes])
# Expression matrix with the 'best.genes' in the original scale (based on P-value ranking)
top.genes.matrix <- 2^dat.filtered[best.genes, ]
> dput(dat[1:5, 1:5])
structure(list(ID_REF = c("1007_s_at", "1053_at", "117_at", "121_at",
"1255_g_at"), IDENTIFIER = c("MIR4640", "RFC2", "HSPA6", "PAX8",
"GUCA1A"), NB_GSM97800 = c(4701.5, 282.7, 769.6, 1616.3, 232.7
), NB_GSM97803 = c(4735, 347.9, 287.9, 1527.2, 204.8), NB_GSM97804 = c(2863.9,
355, 199, 1793.8, 119.3)), row.names = c(NA, 5L), class = "data.frame")
> dput(dat.filtered[1:5, 1:5])
structure(list(NB_GSM97800 = c(4701.5, 282.7, 769.6, 1616.3,
232.7), NB_GSM97803 = c(4735, 347.9, 287.9, 1527.2, 204.8), NB_GSM97804 = c(2863.9,
355, 199, 1793.8, 119.3), NB_GSM97805 = c(5350.2, 319.9, 182.8,
1880, 180.2), NB_GSM97807 = c(4789.4, 294.2, 204.3, 1012, 156.7
)), row.names = c("1007_s_at", "1053_at", "117_at", "121_at",
"1255_g_at"), class = "data.frame")
> > head(rawp) 1007_s_at 1053_at 117_at 121_at 1255_g_at 1294_at
> 6.631006e-14 1.111397e-05 5.772347e-02 9.229543e-02 7.378613e-24 1.027390e-05
>
> dput(best.genes.df[1:5, 1:5])
structure(list(index = c(29035L, 49756L, 23236L, 49011L, 15902L
), exp.NB_GSM97800 = c(960, 819.6, 3156.3, 1809.7, 2540.5), exp.NB_GSM97803 = c(1509.1,
732.7, 4206.9, 1851.2, 3455.3), exp.NB_GSM97804 = c(1121.2, 138.3,
3442.1, 645.4, 1764.5), exp.NB_GSM97805 = c(1365.6, 431.6, 2783.3,
1261.8, 1990.1)), row.names = c("219752_at", "240512_x_at", "213938_at",
"239767_at", "206456_at"), class = "data.frame")
>
> dput(top.genes.matrix[1:5, 1:5])
structure(list(NB_GSM97800 = c(9.7453140114e+288, 5.29888441356601e+246,
Inf, Inf, Inf), NB_GSM97803 = c(Inf, 3.67009936848685e+220, Inf,
Inf, Inf), NB_GSM97804 = c(Inf, 4.2899121663437e+41, Inf, 1.92645647596287e+194,
Inf), NB_GSM97805 = c(Inf, 8.40516277768061e+129, Inf, Inf, Inf
), NB_GSM97807 = c(Inf, 5.18689446110124e+179, Inf, Inf, Inf)), row.names = c("219752_at",
"240512_x_at", "213938_at", "239767_at", "206456_at"), class = "data.frame")