0

I am a beginner in R and I have a big data.frame (more than 300000 obs) that looks like this:

# Sample data: 143 observations from one station, spread over two dates.
Dados <- data.frame(
  stringsAsFactors = FALSE,
  # Sequential row identifier, 1 through 143.
  id = 1:143,
  # A single station name; data.frame() recycles it to every row.
  Identification = "LONNIE POOL FIELD WEAVERVILLE",
  # Dates are stored as character strings (not Date objects):
  # 72 rows for the first date followed by 71 rows for the second.
  Dates = rep(c("1/01/2014", "2/01/2014"), times = c(72L, 71L)),
  # Integer temperature readings, in row order
  # (the column name suggests degrees Celsius).
  TEMP_Celcius = c(
    13L, 10L, 8L, 7L, 5L, 4L, 3L, 3L, 2L, 2L, 2L, 2L, 0L, 0L,
    0L, 0L, 0L, 0L, 0L, 0L, -1L, -1L, -2L, -1L, -2L, -2L,
    -2L, -2L, -2L, -2L, -2L, -2L, -3L, -3L, -3L, -3L, -3L, -3L,
    -3L, -3L, -4L, -4L, -3L, -4L, -4L, -4L, -4L, -4L, -4L, -3L,
    -3L, -2L, 0L, 1L, 2L, 3L, 4L, 6L, 6L, 8L, 9L, 9L, 10L, 11L,
    12L, 12L, 12L, 12L, 13L, 13L, 13L, 13L, 12L, 10L, 9L, 8L,
    6L, 5L, 5L, 4L, 4L, 3L, 3L, 2L, 2L, 2L, 2L, 0L, 1L, 0L, 0L,
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, -1L, -1L, -1L, -2L, -2L,
    -1L, -2L, -2L, -2L, -2L, -2L, -2L, -2L, -2L, -3L, -3L, -3L,
    -3L, -3L, -3L, -3L, -3L, -2L, -2L, 0L, 0L, 1L, 3L, 4L, 5L,
    6L, 7L, 8L, 9L, 10L, 10L, 12L, 13L, 13L, 13L, 13L, 14L, 14L,
    14L
  )
)

I need to compute other columns, such as the average, minimum, and maximum temperature and the average, maximum, and minimum dew point, each per day. I have many rows per day because the data is hourly. I have tried many approaches, but I keep getting a wrong result.

First I tried to get the mean with this code:

tapply (Dados$TEMP_Celcius, Dados$Dates, mean) But I get a wrong result. For example, for the date 01-01-2014 I get 27.8, whereas the correct result would be 1.97.

I also tried the following code:

# Per-date mean of the temperature column, dropping missing readings.
tapply(
  X = Dados$TEMP_Celcius,
  INDEX = Dados$Dates,
  FUN = mean,
  na.rm = TRUE
)

# Per-date mean via aggregate(); the grouping column is labelled TMEDIA.
# No NA removal is requested in this call.
aggregate(
  x = Dados$TEMP_Celcius,
  by = list(TMEDIA = Dados$Dates),
  FUN = mean
)

But I got the same results. I do not know what I am doing wrong. Could you help me, please?

I already checked the class of the Dates column, and it is "Date"; I also checked the class of the temperature variable, and it is "numeric".

Suraj Kumar
  • 5,547
  • 8
  • 20
  • 42

1 Answer

0

Without actually seeing what data you have, perhaps you could try this? It uses the tidyverse (which you should learn as it will make everything much easier).

library(tidyverse)

# Collapse the observations to one row per date, reporting the mean,
# minimum, and maximum of the temperature column.
Dados %>%
  group_by(Dates) %>%
  summarise(
    mean = mean(TEMP_Celcius),
    min = min(TEMP_Celcius),
    max = max(TEMP_Celcius)
  )

This gave me this output:

# A tibble: 2 x 4
  Dates      mean   min   max
  <chr>     <dbl> <dbl> <dbl>
1 1/01/2014  1.97    -4    13
2 2/01/2014  2.75    -3    14

Updated after suggestion from @Jon Spring:

library(tidyverse)

# Group by station as well as date so that readings from different
# locations are not averaged together, then report the mean, minimum,
# and maximum of the temperature column for each group.
Dados %>%
  group_by(Identification, Dates) %>%
  summarise(
    mean = mean(TEMP_Celcius),
    min = min(TEMP_Celcius),
    max = max(TEMP_Celcius)
  )

Output:

# A tibble: 2 x 5
# Groups:   Identification [?]
  Identification                Dates      mean   min   max
  <chr>                         <chr>     <dbl> <dbl> <dbl>
1 LONNIE POOL FIELD WEAVERVILLE 1/01/2014  1.97    -4    13
2 LONNIE POOL FIELD WEAVERVILLE 2/01/2014  2.75    -3    14
william3031
  • 1,653
  • 1
  • 18
  • 39
  • 1
    Are there multiple locations in your data? If so, you should `group_by(Identification, Dates) %>%` in the code above, otherwise you will be getting the average of all locations on each day. – Jon Spring Mar 18 '19 at 03:33
  • Works for me, see the results above. (Sorry, deleted comment accidentally). – william3031 Mar 18 '19 at 03:34