I'm importing a data file from a Scientific instrument and initially it produces results in a 6 line format and we later change it to a 2 line format with a different header.
It does work just fine when it finds both headers. However, due to being quick we changed the output format to the 2 lines before it wrote the first results. We now get the error message in the batch window: Error in readQuat(file) : object 'head_ln' no found
My question is: what would be the expression to skip this part and carry on with the next object? In our case then the 2 line format.I'd like to emphasize that the script has to work for the cases: 6 line format followed by 2 line format and 2 line format only.
My code is
# Title:
# Purpose:
# Author:
#needed Libraries
library(plyr)
# Convert 100:01.782 (M:S.S) to 100.0297 (M.M) or decimal minutes
MS <- function(x, units="mins") {
f <- function(...) {
r <- cbind(...)
return(
as.difftime(as.numeric(r[1,]), units="mins") +
as.difftime(as.numeric(r[2,]), units="secs"))
}
return(as.numeric(do.call(f, strsplit(x, ":")), units=units))
}
readQuat <- function(file) {
# Return data
ret <- list()
# Read whole file
lines <- readLines(file)
# Remove lines that need to be ignored
skip_ln <- c()
for(ln in seq_along(lines)) {
if(grepl("^(00A PROGRAM MODE|\\*\\*\\*)", lines[ln]))
skip_ln <- c(skip_ln, -ln)
}
# Also skip past sample lists
sl_head <- "ORDER POS ID CTIME COUNTS CUCNTS MCW REP STD STMS STIME"
sl_tail <- " INSTRUMENT NUMBER 1"
ln <- 1
while (ln <= length(lines)) {
if (grepl(sl_head, lines[ln])) {
while (ln <= length(lines) && !(grepl(sl_tail, lines[ln]))) {
skip_ln <- c(skip_ln, -ln)
ln <- ln + 1
}
}
ln <- ln + 1
}
# Remove unneeded lines
lines <- lines[skip_ln]
#print(lines)
# Get ID
ln <- 1
ID <- NULL
while (ln <= length(lines)) {
if (grepl("^ ID:", lines[ln])) {
obj <- strsplit(lines[ln], ": ")
ID <- obj[[1]][2]
break
}
ln <- ln + 1
}
if(is.null(ID))
stop("could not find ID in file")
attr(ret, "ID") <- ID
# Find lines between ORDER and NUMBER OF CYCLES
ln <- 1
start_ln <- NA
while (ln <= length(lines)) {
if (grepl("^ORDER", lines[ln])) {
start_ln <- ln
break
}
ln <- ln + 1
}
while (ln <= length(lines)) {
if (grepl("NUMBER OF CYCLES", lines[ln])) {
end_ln <- ln - 1
break
}
ln <- ln + 1
}
if (is.na(start_ln)) {
# Reset line number back to top
ln <- 1
} else {
# Read this section as a table
dat1 <- read.table(text=textConnection(lines[start_ln:end_ln]), header=TRUE, as.is=TRUE)
dat1$ID <- factor(dat1$ID)
dat1$CTIME_mins <- MS(dat1$CTIME, units="mins")
dat1$COUNTS <- factor(dat1$COUNTS)
dat1$CUCNTS <- factor(dat1$CUCNTS)
dat1$MCW <- factor(dat1$MCW)
dat1$REP <- factor(dat1$REP)
dat1$STIME <- factor(dat1$STIME)
ret[["dat1"]] <- dat1
}
# Find other data sections
dat2_header <- "CYC POS REP CTIME DTIME1 DTIME2 CUCNTS SQP SQP% STIME$"
#write.csv(dat2_1_header, "dat2_1_header.csv", row.names = FALSE)
#dat2_2_header
dat3_header <- "CYC POS REP CTIME ID CPM1 COUNTS1 CPM2 COUNTS2"
# Find dat2, and gather six lines for each
while (ln <= length(lines)) {
if (grepl(dat2_header, lines[ln])) {
head_ln <- ln
break
}
ln <- ln + 1
}
d2a <- c(head_ln)
d2b <- c(head_ln + 2)
# don't import the other three lines
# d2c <- c(head_ln + 4)
ln <- head_ln + 1
while (ln <= length(lines)) {
if (grepl(dat3_header, lines[ln])) {
head_ln <- ln
break
} else if (grepl("^Q\\d+", lines[ln])) {
d2a <- c(d2a, ln + 1)
d2b <- c(d2b, ln + 2)
# d2c <- c(d2c, ln + 3)
ln <- ln + 6
} else {
ln <- ln + 1
}
}
# Read this section as a table
con <- textConnection(lines[d2a])
dat2a <- read.table(con, header=TRUE, as.is=TRUE)
dat2a$CTIME_mins <- MS(dat2a$CTIME, units="mins")
con <- textConnection(lines[d2b])
dat2b <- read.table(con, header=TRUE, as.is=TRUE)
# combine columns into one data.frame
dat2 <- cbind(dat2a, dat2b)
#test whether run with same name has been started multiple times and if yes delete unwanted results
inx <- with(dat2, which(CYC == 1 & POS == 1 & REP == 1))
inx <- inx[length(inx)]
dat2 <- if(inx > 1) dat2[-seq_len(inx - 1), ] else dat2
ret[["dat2"]] <- dat2
# We should already be on dat3
stopifnot(grepl(dat3_header, lines[head_ln]))
ln <- head_ln
d3a <- c()
d3b <- c(head_ln)
ln <- head_ln
while (ln <= length(lines)) {
if (grepl("^Q\\d+", lines[ln])) {
d3a <- c(d3a, ln)
} else if (grepl("^ *\\d+", lines[ln], perl=TRUE)) {
d3b <- c(d3b, ln)
}
ln <- ln + 1
}
# Read this section as a table
dat3a <- read.table(text=lines[d3a], header=FALSE, as.is=TRUE)
dat3b <- read.table(text=lines[d3b], header=TRUE, as.is=TRUE)
dat3b$ID <- factor(dat3b$ID)
dat3b$CTIME_mins <- MS(dat3b$CTIME, units="mins")
ret[["dat3a"]] <- dat3a
# dat3b_Ordered <- dat3b[order(dat3b$POS), ]
ret[["dat3b"]] <- dat3b
invisible(ret)
}
#combineData <- function(combine) {
# combined <- rbind.fill(combine[["dat2"]], combine[["dat3b"]])
# write.csv(combined, file = "COMBINED.csv", row.names = FALSE)
# ret[["combined"]] <- combined
# ret
#}
processData <- function(dat) {
# Combine data
combined <- rbind.fill(dat[["dat2"]], dat[["dat3b"]])
#Order combined data
combined_ordered <- combined[order(combined$POS), ]
needed_columns <- combined_ordered[ ,c("ID", "POS", "CYC", "COUNTS2", "CTIME_mins")]
#header_retrival <- data.frame(unique_header = character())
header_retrival <- unique(needed_columns[ ,"ID"])
#create table with sample name as header
# all_headers <- paste(rep(header_retrival, each=2), c("counts", "time"), sep="_")
max.cyc <- max(needed_columns$CYC)
counter_output <- data.frame(CYC=1:max.cyc)
for(n in header_retrival) {
sub <- subset(needed_columns, ID==n)
if (nrow(sub) < max.cyc) {
na.row <- needed_columns[0,][1,]
# extend subset with empty rows at end
sub <- rbind(sub, na.row[rep(1, max.cyc - nrow(sub)),])
}
counter_output[,paste(n, "counts", sep="_")] <- sub$COUNTS2
counter_output[,paste(n, "time", sep="_")] <- sub$CTIME_mins
}
counter_output$CYC <- NULL
counter <- length(header_retrival)
#counter <- 1
#for(i in 1:counter) {
# header <- header_retrival[i]
#write.csv(header, "blabla.csv", row.names = FALSE)
# header_paste_helper_counts <- paste0(header, "_counts")
# header_paste_helper_time <- paste0(header, "_time")
# counter_output <- data.frame(header_paste_helper_counts,
# header_paste_helper_time,
# fix.empty.names = FALSE
# )
#counter_output[ ,i] <- header_paste_helper_counts
#counter_output[ ,i+1] <- header_paste_helper_time
#j <- NULL
#counter2 <- nrow(needed_columns)
#for(j in 1:counter2) {
#}
#}
# Export CSV files, etc
ID <- attr(dat, "ID")
file2 <- paste0(ID, "_dat2.csv")
message(paste("Writing", file2))
write.csv(dat[["dat2"]], file2, row.names = FALSE)
file3b <- paste0(ID, "_dat3b.csv")
message(paste("Writing", file3b))
write.csv(dat[["dat3b"]], file3b, row.names = FALSE)
file_combined <- paste0(ID, "_combined.csv")
message(paste("Writing", file_combined))
write.csv(combined, file_combined, row.names = FALSE)
file_combined_ordered <- paste0(ID, "_combined_ordered.csv")
message(paste("Writing", file_combined_ordered))
write.csv(combined_ordered, file_combined_ordered, row.names = FALSE)
file_needed_columns <- paste0(ID, "_needed_columns.csv")
message(paste("Writing", file_needed_columns))
write.csv(needed_columns, file_needed_columns, row.names = FALSE)
file_header_retrival <- paste0(ID, "_header_retrival.csv")
message(paste("Writing", file_header_retrival))
write.csv(header_retrival, file_header_retrival, row.names = FALSE)
file_counter <- paste0(ID, "_counter.csv")
message(paste("Writing", file_counter))
write.csv(counter, file_counter, row.names = FALSE)
file_counter_output <- paste0(ID, "_counter_output.csv")
message(paste("Writing", file_counter_output))
write.csv(counter_output, file_counter_output, row.names = FALSE)
}
if (!interactive()) {
# Processing with Rcmd
args <- commandArgs(trailingOnly = TRUE)
if (length(args) != 1)
stop("one file needs to be supplied")
file <- args[1]
dat <- readQuat(file)
processData(dat)
}
Example for 6 line format:
> CYC POS REP CTIME DTIME1 DTIME2 CUCNTS SQP SQP%
> STIME
>
> ID CPM1 COUNTS1 CPM1% CPM2 COUNTS2 CPM2%
>
> CPM3 COUNTS3 CPM3% CPM4 COUNTS4 CPM4%
>
> CPM5 COUNTS5 CPM5% CPM6 COUNTS6 CPM6%
>
> CPM7 COUNTS7 CPM7% CPM8 COUNTS8 CPM8%
>
> Q010101N.001 20 SEP 2017 9:06 1 1 1 5:00.900 11.827
> 11.808 55069 .00 .00 0:00
> WSTD 11429.37 55069 .43 63994.43 308338 .18
> .00 0 .00 .00 0 .00
> .00 0 .00 .00 0 .00
> .00 0 .00 .00 0 .00
Example for 2 line format:
CYC POS REP CTIME ID CPM1 COUNTS1 CPM2 COUNTS2
Q030301N.001 21 SEP 2017 13:14
1 3 1 100:01.749 B1 .23 23 .68 67
So basically it cannot find dat2_header
and needs to carry on in finding dat3_header
. However it needs to be flexible to do both if needed