2

I’m trying to break my server log into multiple files so I can run some metrics on them. I have a cron job that appends a string and a timestamp to the server log on the first of every month; the string looks like this: ‘Monthly Breakpoint, March 1 2020’. The idea is that I can split this large server-log file into multiple log files using that line as a delimiter, then run some metrics on each file. I’m trying to write a script that will create these output files for me, but I’m struggling with it. So far I can read the file, loop through the lines, and find the delimiter, but I’m not sure what the best approach is for a problem like this. Maybe I shouldn't be using R at all and there's an easier way?

# server log
serverLog <- "server-out.log"

# Process File
# Passing the path straight to readLines() lets it open and close the
# connection itself, so nothing is left open if reading fails.
linn <- readLines(serverLog)

# seq_along() gives an empty sequence for an empty log; 1:length(linn) would
# iterate over c(1, 0) and fail inside if() on a zero-length condition.
for (i in seq_along(linn)) {
  print(linn[i])
  # TRUE when this line is the monthly delimiter written by the cron job
  test <- grepl("Monthly", linn[i])
  # print(paste0("test: ", test))
  if (test) {
    print("Found Monthly Breakpoint")
  }
}

# Example of the server-out.log file 

[0mGET /notifications [36m304 [0m9.439 ms - -[0m
[0mGET /user/status [36m304 [0m2.137 ms - -[0m
[0mGET /user/status [36m304 [0m5.675 ms - -[0m
[0mPOST /user/login [32m200 [0m19.960 ms - 30[0m
[0mGET /user/status [36m304 [0m9.518 ms - -[0m
[0mGET /user/status [32m200 [0m2.364 ms - 16[0m
[0mGET /user/status [36m304 [0m1.396 ms - -[0m
[0mGET /user/status [36m304 [0m1.087 ms - -[0m
[0mPOST /user/login [32m200 [0m300.214 ms - 30[0m
[0mGET /user/status [36m304 [0m4.374 ms - -[0m
[0mGET /localUser [32m200 [0m2.260 ms - 1045[0m

 Monthly Breakpoint, March 1 2020

[0mGET /user/status [32m200 [0m5.284 ms - 16[0m
[0mGET /user/status [36m304 [0m2.101 ms - -[0m
[0mGET /users [32m200 [0m2.387 ms - 36[0m
[0mGET /notifications [32m200 [0m30.395 ms - 2624[0m
[0mGET /user/status [36m304 [0m2.172 ms - -[0m
[0mGET /user/status [36m304 [0m1.424 ms - -[0m
[0mGET /user/status [36m304 [0m2.074 ms - -[0m
[0mGET /user/status [36m304 [0m0.920 ms - -[0m
[0mGET /users [36m304 [0m2.471 ms - -[0m
[0mGET /notifications [36m304 [0m8.416 ms - -[0m
[0mGET /user/status [36m304 [0m1.757 ms - -[0m
[0mGET /user/status [36m304 [0m1.114 ms - -[0m
[0mGET /favicon.ico [33m404 [0m2.218 ms - 150[0m
[0mGET /user/status [36m304 [0m2.003 ms - -[0m
[0mPOST /user/login [32m200 [0m175.473 ms - 30[0m
[0mGET /user/status [36m304 [0m3.893 ms - -[0m
  • Update: I tried using csplit because it sounds like a good option for this problem, but I can't get that working either. Can you provide an example?
csplit -z server-out.min /Monthly/ '{*}'

csplit: illegal option -- z
usage: csplit [-ks] [-f prefix] [-n number] file args ...
MostlyRquestions
  • 526
  • 2
  • 7
  • 22
  • You could use csplit (see https://unix.stackexchange.com/questions/263904/split-file-into-multiple-files-based-on-pattern) to break up the file and then read them in as a list of files. – user12728748 Mar 17 '20 at 15:30
  • There may be another tool with that name, but I referred to the one in GNU coreutils. See https://www.gnu.org/software/coreutils/manual/html_node/csplit-invocation.html for documentation. – user12728748 Mar 17 '20 at 16:06
  • @user12728748 can you provide an example? – MostlyRquestions Mar 17 '20 at 16:08
  • `sed '/^$/d' out.log | csplit --suppress-matched -z - /"Monthly Breakpoint"/ {*}` – user12728748 Mar 17 '20 at 16:39

3 Answers3

1

Probably using some UNIX commands would be most "natural", awk and csplit would work in that regard.

I have an R solution for you anyways. Instead of using readLines() I would start with read.delim(). That way you start out with a data.frame and can then use any tools for data.frame manipulation. I am most familiar with the tidyverse commands, that's why I would use them here.

# Process File
library(tidyverse)

# Read the log one line per row (column V1), label every row with the date of
# the most recent "Monthly Breakpoint" line above it, then drop the
# breakpoint marker rows themselves.
log_df <- read.delim(serverLog, header = FALSE) %>%
  mutate(breakpoint = grepl("Monthly Breakpoint", V1),
         # trimws() removes whitespace left over when the marker line is
         # indented, so it does not leak into the label or the file name.
         breakdate = ifelse(breakpoint,
                            trimws(gsub("Monthly Breakpoint, ", "", V1)),
                            NA)) %>%
  # carry each breakpoint date down to the log lines that follow it
  fill(breakdate) %>%
  # rows above the first breakpoint have no date to inherit
  mutate(breakdate = ifelse(is.na(breakdate), "before first breakdate", breakdate)) %>%
  filter(!breakpoint) %>%
  select(-breakpoint)

# Save Files
# One CSV per breakdate. paste0() avoids the space that paste() would put
# before ".csv"; walk() is used because only the side effect is wanted
# (lapply() would print a list of NULLs).
log_df %>%
  split(.$breakdate) %>%
  walk(function(x) {
    write.csv(x, file = paste0(x$breakdate[1], ".csv"), row.names = FALSE)
  })

I don't know, though, whether storing the data in separate files is the best workflow to choose here. Why not just keep the data in R, split the lines into several columns, and group your analysis by month?

EDIT: This is what the splitting into columns and some analysis could look like.

# split / separate() into columns

log_sep_df <-
  log_df %>%
  as_tibble() %>%
  # drop the first character of every line
  # (presumably the escape byte of the ANSI colour code -- confirm on real data)
  mutate(V1 = substr(V1, 2, nchar(as.character(V1)))) %>%
  # cut each line on "[", a double space or a single space into var1..var10
  separate(V1, into = c(paste0("var", 1:10)), sep = "\\[|  | ") %>%
  # HTTP verb: anything that is not a POST is labelled GET
  mutate(http = ifelse(grepl("POST", var1), "POST", "GET")) %>%
  mutate(var1 = gsub("POST|GET", "", var1))

# get month labels
library(lubridate)
log_sep_df <-
  log_sep_df %>%
  # Refer to the column directly instead of log_sep_df$breakdate so the
  # expression stays inside mutate()'s data mask. quiet = TRUE because the
  # "before first breakdate" placeholder is expected not to parse (-> NA).
  # mdy() already returns a Date, so no as.Date() wrapper is needed.
  mutate(date = mdy(breakdate, quiet = TRUE))

# Rows before the first breakpoint have no date; place them 10 days before
# the earliest breakpoint so they get their own (earlier) label.
date_before_first_breakpoint <- min(log_sep_df$date, na.rm = TRUE) - 10

log_sep_df <-
  log_sep_df %>%
  mutate(date = if_else(is.na(date),
                        date_before_first_breakpoint,
                        date),
         month = month(date, label = TRUE))


# grouped visualization of logs: request counts per HTTP verb, one panel per month
ggplot(log_sep_df, aes(http)) +
  geom_bar() +
  facet_wrap(~month)
Till
  • 3,845
  • 1
  • 11
  • 18
1

This isn't the most elegant answer but this got me what I needed. I'll try out the other answer, it's a good idea to keep the data in my R environment so I can run all my metrics without reading in unnecessary files. Thanks @Till

#~~~~~~~~~~~~~~~~~~~~~~#
#~~ Parse Server Log ~~#
#~~~~~~~~~~~~~~~~~~~~~~#

# Read File
# readLines() on a path opens and closes the connection itself, so nothing
# is left open if reading fails.
serverLog <- "server-out.min"
linn <- readLines(serverLog)

# Assign every line to an output file number.
# A delimiter line closes the file it appears in, so a line's file number is
# 1 + the number of delimiters strictly before it. This is vectorized and
# yields an empty result for an empty log, where 1:length(linn) would
# iterate over c(1, 0) and crash.
is_breakpoint <- grepl("Monthly", linn)
file_num <- cumsum(is_breakpoint) - is_breakpoint + 1L

# Write Files
# One write per output file instead of reopening the file for every line.
# append = TRUE is kept, so re-running adds to existing server-log-* files.
chunks <- split(linn, file_num)
for (num in names(chunks)) {
  chunk <- chunks[[num]]

  # current output file
  file <- paste0("server-log-", num)
  # write to file (a character vector is written one element per line)
  write(chunk, file = file, append = TRUE)

  # Check for Monthly Delimiter: only the last line of a chunk can be one
  if (grepl("Monthly", chunk[length(chunk)])) {
    print("Found Monthly Breakpoint")
  }
}
MostlyRquestions
  • 526
  • 2
  • 7
  • 22
  • I added some code of how the splitting into columns etc. could be done to my answer. – Till Mar 17 '20 at 17:27
0

If you want to do it in R, you could use a data.table solution for efficiency:

library(data.table)

# Read the log as a single character column (V1) and discard empty lines.
DT <- fread("out.log", sep = NULL, header = FALSE)
DT <- DT[V1 != ""]

# Flag the delimiter rows once and reuse the mask below.
is_breakpoint <- grepl("Monthly Breakpoint", DT$V1)

# rleid() numbers each run of consecutive equal flags, so the log lines
# between two delimiters share one Idx.
DT[, Idx := rleid(is_breakpoint)]

# Remove the delimiter rows, keeping only real log lines.
DT <- DT[!is_breakpoint]

DT.list <- split(DT, DT$Idx) ## or just operate by Idx
user12728748
  • 8,106
  • 2
  • 9
  • 14