Update:
Thanks to @aelwan for catching a bug, and the updated code is below:
library(ggplot2)
library(reshape2)
# read in the data
dfStage = read.csv("reshapeR/Data/stage.csv", header = FALSE, stringsAsFactor = FALSE)
# remove the rows which are min, max, mean & redundant columns
condMMM = stringr::str_trim(dfStage[, 1]) %in% c("Min", "Max", "Mean", "Day")
dfStage = dfStage[!condMMM, 1:13]
dateVars = c("Day", "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
colnames(dfStage) = dateVars
# get indices & names of year site combinations
condlSiteYear = grepl("^Daily means", stringr::str_trim(dfStage[, 1]))
condiSiteYear = grep("^Daily means", stringr::str_trim(dfStage[, 1]))
dfSiteYear = dfStage[condlSiteYear, 1, drop = FALSE]
# remove site-year rows from data
dfStage = dfStage[!condlSiteYear, ]
# get the list of sites and years
dfSiteYear$Year = regmatches(dfSiteYear[, 1], regexpr("(?<=Year\\s)([0-9]+)", dfSiteYear[, 1], perl = TRUE))
dfSiteYear$Site = regmatches(dfSiteYear[, 1],
regexpr("(?<=(Stage\\s\\(mm\\)\\sat\\s))([A-Za-z\\s0-9\\.]+)", dfSiteYear[, 1], perl = TRUE))
# add the site and years
dfSiteYearLong = dfSiteYear[rep(1:dim(dfSiteYear)[1], each = 31), c("Site", "Year")]
dfStageFinal = cbind(dfStage, dfSiteYearLong)
# reshape
dfStageFinalLong = reshape2::melt(dfStageFinal, id.vars = c("Day", "Site", "Year"),
measure.vars = dateVars[-1],
variable.name = "Month")
dfStageFinalWide = reshape2::dcast(dfStageFinalLong, Day + Month + Year ~ Site,
value.var = "value")
# cleanup
dfStageFinalWide[, -c(1:3)] = lapply(dfStageFinalWide[, -c(1:3)], as.numeric)
# create a date variable
dfStageFinalWide$Date = with(dfStageFinalWide,
as.Date(paste(Day, Month, Year, sep = "-"),
format = "%d-%b-%Y"))
# remove the infeasible dates
dfStageFinalWide = dfStageFinalWide[!is.na(dfStageFinalWide$Date), ]
dfStageFinalWide = dfStageFinalWide[order(dfStageFinalWide$Date), ]
# plot the values over time
dfStageFinalLong =
reshape2::melt(dfStageFinalWide, id.vars = "Date", measure.vars = unique(dfSiteYear$Site),
variable.name = "Site")
ggplot(dfStageFinalLong, aes(x = Date, y = value, color = Site))+
geom_line() + theme_bw() + facet_wrap(~ Site, scale = "free_y")
This leads to the picture below:

Original answer:
This example requires a fair amount of data munging skills. You basically have to note the repeating patters in the data -- the data are site-year measurements organized as day x month tables.
Recipe:
Here is a recipe for creating the desired dataset:
1. Remove the rows & columns in the data that are redundant.
2. Extract the rows that identify the year and the site of the table using pattern matching (grep
).
3. From the longer string, extract the year and site name using regular expressions (regexpr
and regmatches
).
4. Find the starting row indices of the tables for each site-year combination and assign the site-year names just extracted to all rows that correspond to that site & year.
5. Now you can go ahead and reshape it into any shape you want. In the code below, the row identifiers are year, month and day, and the columns are the sites.
6. Some cleanup, and you are good to go.
Code:
Here is code for the recipe above:
# read in the data
dfStage = read.csv("reshapeR/Data/stage.csv", header = FALSE, stringsAsFactor = FALSE)
# remove the rows which are min, max, mean & redundant columns
condMMM = stringr::str_trim(dfStage[, 1]) %in% c("Min", "Max", "Mean", "Day")
dfStage = dfStage[!condMMM, 1:13]
dateVars = c("Day", "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
colnames(dfStage) = dateVars
# get indices & names of year site combinations
condlSiteYear = grepl("^Daily means", stringr::str_trim(dfStage[, 1]))
condiSiteYear = grep("^Daily means", stringr::str_trim(dfStage[, 1]))
dfSiteYear = dfStage[condlSiteYear, 1, drop = FALSE]
# remove site-year rows from data
dfStage = dfStage[!condlSiteYear, ]
# get the list of sites and years
dfSiteYear$Year = regmatches(dfSiteYear[, 1], regexpr("(?<=Year\\s)([0-9]+)", dfSiteYear[, 1], perl = TRUE))
dfSiteYear$Site = regmatches(dfSiteYear[, 1],
regexpr("(?<=(Stage\\s\\(mm\\)\\sat\\s))([A-Za-z\\s0-9\\.]+)", dfSiteYear[, 1], perl = TRUE))
# add the site and years
dfSiteYearLong = dfSiteYear[rep.int(1:dim(dfSiteYear)[1], 31), c("Site", "Year")]
dfStageFinal = cbind(dfStage, dfSiteYearLong)
# reshape
dfStageFinalLong = reshape2::melt(dfStageFinal, id.vars = c("Day", "Site", "Year"), measure.vars = dateVars[-1],
variable.name = "Month")
dfStageFinalWide = dcast(dfStageFinalLong, Day + Month + Year ~ Site, value.var = "value")
# cleanup
dfStageFinalWide[, -c(1:3)] = lapply(dfStageFinalWide[, -c(1:3)], as.numeric)
# create a date variable
dfStageFinalWide$Date = with(dfStageFinalWide,
as.Date(paste(Day, Month, Year, sep = "-"),
format = "%d-%b-%Y"))
# remove the infeasible dates
dfStageFinalWide = dfStageFinalWide[!is.na(dfStageFinalWide$Date), ]
dfStageFinalWide = dfStageFinalWide[order(dfStageFinalWide$Date), ]
# plot the values over time
dfStageFinalLong =
melt(dfStageFinalWide, id.vars = "Date", measure.vars = unique(dfSiteYear$Site),
variable.name = "Site")
ggplot(dfStageFinalLong, aes(x = Date, y = value, color = Site))+
geom_line() + theme_bw() + facet_wrap(~ Site, scale = "free_y")
Output:
Here is what the output looks like:
> head(dfStageFinalWide)
Day Month Year Kumeti at Te Rehunga Makakahi at Hamua Makuri at Tuscan Hills Manawatu at Hopelands Manawatu at Upper Gorge Manawatu at Weber Road Mangahao at Ballance
1 1 Jan 1990 454 NA 700 5133 NA NA NA
2 1 Jan 1991 1002 3643 1416 50 3597 1836 18160
3 1 Jan 1992 3490 34239 8922 3049 1221 417 NA
4 1 Jan 1993 404 NA 396 3408 NA 272 NA
5 1 Jan 1994 NA NA 3189 795 NA 2321 1889
6 1 Jan 1995 16548 1923 69862 4808 NA 6169 94
Mangapapa at Troup Rd Mangatainoka at Larsons Road Mangatainoka at Pahiatua Town Bridge Mangatainoka at Tararua Park Mangatoro at Mangahei Road Oruakeretaki at S.H.2 Napier
1 9406 2767 NA NA 6838 2831
2 4985 2479 823 1078 76 105
3 478 3665 1415 210 394 8247
4 6394 1298 NA 2668 3837 1878
5 14051 3561 NA 2645 807 NA
6 NA 1057 7029 4497 NA NA
Raparapawai at Jackson Rd Tamaki at Stephensons Tiraumea at Ngaturi
1 5189 50444 17951
2 345 416 3025
3 1364 5713 1710
4 3457 28078 8670
5 199 NA 292
6 NA NA 22774
And a picture to bring it all together.
