0

I've been trying to load up this function that will take data from three different data frames with some baseball data and then produce a fourth frame with averages of that data that is weighted depending on if the player appears in all 3 frames (played all 3 years) or less, and so on.

The data frames are linked by the fact that each has a playerid which is what I'm using to check how many and which years they specifically appear in. I also have a fourth data frame that has a master list of the playerids and corresponding names that I use to establish the list of IDs to check.

Below is what my data frames with the stats look like of which I have 3 spanning back to 2016

> head(batters_18)
  player_id       player_name launch_speed launch_angle
1    592450       Aaron Judge         94.7         12.4
2    408234    Miguel Cabrera         94.4          7.3
3    443558       Nelson Cruz         93.9         12.8
4    608336        Joey Gallo         93.8         21.5
5    519317 Giancarlo Stanton         93.7         11.6
6    623520        David Bote         93.5          3.5

this is my actual function code

# to combine batting stats from the 3 seasons in the appropriate categories
# but with a weighting of 45% in 2018, 35% in 2017, and 20% in 2016 for sake
# of favoring recent form and performance, but in each seasons all players have
# at least 50 events

combine.batting.stats <- function(batters_16, batters_17, batters_18, playerID_map){

  #using the stats for each year along with the player ID map

  b18 = batters_18
  b17 = batters_17
  b16 = batters_16
  playerID_map = playerID_map
  playerid = playerID_map$MLBID

  # so first my weights with the scenarios being 
  # exists in all 3 years, exits in exactly two, and finally exists exactly one



  # the check for whether something is in a data frame is as below
  # SOMETHING %in% DATAFRAME$COLUMN
  # this should be used to code three different scenarios where I weight 
  # the value of season stats depending on how may seasons they qualify in

  if(playerid %in% b18$player_id = TRUE & playerid %in% b17$player_id = TRUE
     & playerid %in% b16$player_id = TRUE) {

    #calculation for case of 3 year player
    # 18 is 45%, 17 is 35%, and 16 is 20%

    average_launch_speed = (((b18$launch_speed * 0.45) + (b17$launch_speed * 0.35)
                             + (b16$launch_speed * 0.2)) / 3)

    average_launch_angle = (((b18$launch_angle * 0.45) + (b17$launch_angle * 0.35)
                             + (b16$launch_angle * 0.2)) / 3)

  }

  if(playerid %in% b18$player_id = TRUE & playerid %in% b17$player_id = TRUE
     & playerid %in% b16$player_id = FALSE) {

    #calculation for player in b18 and b17 but not b16....should be extended to
    #other 2 year player situations that is b17 and b16 but not b18 as well as
    #b18 and b16 but not b17 (which I would like to skew even more to b18 stats)
    #than players who have played the most recent 2 years to reflect potential 
    #post injury change

    average_launch_speed = (((b18$launch_speed * 0.6) + (b17$launch_speed * 0.4)) 
                            / 2)

    average_launch_angle = (((b18$launch_angle * 0.6) + (b17$launch_angle * 0.4)) 
                            / 2)

  }

  if(playerid %in% b18$player_id = TRUE & playerid %in% b17$player_id = FALSE
     & playerid %in% b16$player_id = TRUE) {

    #in b18 and b16 but not b17


    average_launch_speed = (((b18$launch_speed * 0.6) + (b16$launch_speed * 0.4)) 
                            / 2)

    average_launch_angle = (((b18$launch_angle * 0.6) + (b16$launch_angle * 0.4)) 
                            / 2)

  }

  if(playerid %in% b18$player_id = FALSE & playerid %in% b17$player_id = TRUE
     & playerid %in% b16$player_id = TRUE) {

    #in b17 and b16 but not b18


    average_launch_speed = (((b17$launch_speed * 0.6) + (b16$launch_speed * 0.4)) 
                            / 2)

    average_launch_angle = (((b17$launch_angle * 0.6) + (b16$launch_angle * 0.4)) 
                            / 2)

  }

  # next are those in only one single frame/year
  # this one is only in 18

  if(playerid %in% b18$player_id = TRUE & playerid %in% b17$player_id = FALSE
     & playerid %in% b16$player_id = FALSE){

    average_launch_speed = b18$launch_speed

    average_launch_angle = b18$launch_angle 

  }

  # only in b17

  if(playerid %in% b18$player_id = FALSE & playerid %in% b17$player_id = TRUE
     & playerid %in% b16$player_id = FALSE){

    average_launch_speed = b17$launch_speed

    average_launch_angle = b17$launch_angle 

  }

  #only in b16

  if(playerid %in% b18$player_id = FALSE & playerid %in% b17$player_id = FALSE
     & playerid %in% b16$player_id = TRUE){

    average_launch_speed = b16$launch_speed

    average_launch_angle = b16$launch_angle 

  }

  # returning a data frame from the function
  combined_stats = data.frame(playerid, average_launch_speed, average_launch_angle)

}

and then this is the error I get in the console which I just don't understand the issue with my function

> # to combine batting stats from the 3 seasons in the appropriate categories
> # but with a weighting of 45% in 2018, 35% in 2017, and 20% in 2016 for sake
> # of favoring recent form and performance, but in each seasons all players have
> # at least 50 events
> 
> combine.batting.stats <- function(batters_16, batters_17, batters_18, playerID_map){
+   
+   #using the stats for each year along with the player ID map
+   
+   b18 = batters_18
+   b17 = batters_17
+   b16 = batters_16
+   playerID_map = playerID_map
+   playerid = playerID_map$MLBID
+   
+   # so first my weights with the scenarios being 
+   # exists in all 3 years, exits in exactly two, and finally exists exactly one
+   
+   
+   
+   # the check for whether something is in a data frame is as below
+   # SOMETHING %in% DATAFRAME$COLUMN
+   # this should be used to code three different scenarios where I weight 
+   # the value of season stats depending on how may seasons they qualify in
+   
+   if(playerid %in% b18$player_id = TRUE & playerid %in% b17$player_id = TRUE
Error: unexpected '=' in:
"  
  if(playerid %in% b18$player_id ="
>      & playerid %in% b16$player_id = TRUE) {
Error: unexpected '&' in "     &"
>     
>     #calculation for case of 3 year player
>     # 18 is 45%, 17 is 35%, and 16 is 20%
>     
>     average_launch_speed = (((b18$launch_speed * 0.45) + (b17$launch_speed * 0.35)
+                              + (b16$launch_speed * 0.2)) / 3)
Error: object 'b18' not found
>     
>     average_launch_angle = (((b18$launch_angle * 0.45) + (b17$launch_angle * 0.35)
+                              + (b16$launch_angle * 0.2)) / 3)
Error: object 'b18' not found
>     
>   }
Error: unexpected '}' in "  }"
>   
>   if(playerid %in% b18$player_id = TRUE & playerid %in% b17$player_id = TRUE
Error: unexpected '=' in "  if(playerid %in% b18$player_id ="
>      & playerid %in% b16$player_id = FALSE) {
Error: unexpected '&' in "     &"
>     
>     #calculation for player in b18 and b17 but not b16....should be extended to
>     #other 2 year player situations that is b17 and b16 but not b18 as well as
>     #b18 and b16 but not b17 (which I would like to skew even more to b18 stats)
>     #than players who have played the most recent 2 years to reflect potential 
>     #post injury change
>     
>   }
Error: unexpected '}' in "  }"
>     
>     
>   data.frame(check.rows = FALSE)
data frame with 0 columns and 0 rows
>   
> }
Error: unexpected '}' in "}"
> }
Error: unexpected '}' in "}"
> # to combine batting stats from the 3 seasons in the appropriate categories
> # but with a weighting of 45% in 2018, 35% in 2017, and 20% in 2016 for sake
> # of favoring recent form and performance, but in each seasons all players have
> # at least 50 events
> 
> combine.batting.stats <- function(batters_16, batters_17, batters_18, playerID_map){
+   
+   #using the stats for each year along with the player ID map
+   
+   b18 = batters_18
+   b17 = batters_17
+   b16 = batters_16
+   playerID_map = playerID_map
+   playerid = playerID_map$MLBID
+   
+   # so first my weights with the scenarios being 
+   # exists in all 3 years, exits in exactly two, and finally exists exactly one
+   
+   
+   
+   # the check for whether something is in a data frame is as below
+   # SOMETHING %in% DATAFRAME$COLUMN
+   # this should be used to code three different scenarios where I weight 
+   # the value of season stats depending on how may seasons they qualify in
+   
+   if(playerid %in% b18$player_id = TRUE & playerid %in% b17$player_id = TRUE
Error: unexpected '=' in:
"  
  if(playerid %in% b18$player_id ="
>      & playerid %in% b16$player_id = TRUE) {
Error: unexpected '&' in "     &"
>     
>     #calculation for case of 3 year player
>     # 18 is 45%, 17 is 35%, and 16 is 20%
>     
>     average_launch_speed = (((b18$launch_speed * 0.45) + (b17$launch_speed * 0.35)
+                              + (b16$launch_speed * 0.2)) / 3)
Error: object 'b18' not found
>     
>     average_launch_angle = (((b18$launch_angle * 0.45) + (b17$launch_angle * 0.35)
+                              + (b16$launch_angle * 0.2)) / 3)
Error: object 'b18' not found
>     
>   }
Error: unexpected '}' in "  }"
>   
>   if(playerid %in% b18$player_id = TRUE & playerid %in% b17$player_id = TRUE
Error: unexpected '=' in "  if(playerid %in% b18$player_id ="
>      & playerid %in% b16$player_id = FALSE) {
Error: unexpected '&' in "     &"
>     
>     #calculation for player in b18 and b17 but not b16....should be extended to
>     #other 2 year player situations that is b17 and b16 but not b18 as well as
>     #b18 and b16 but not b17 (which I would like to skew even more to b18 stats)
>     #than players who have played the most recent 2 years to reflect potential 
>     #post injury change
>     
>     average_launch_speed = (((b18$launch_speed * 0.6) + (b17$launch_speed * 0.4)) 
+                             / 2)
Error: object 'b18' not found
>     
>     average_launch_angle = (((b18$launch_angle * 0.6) + (b17$launch_angle * 0.4)) 
+                             / 2)
Error: object 'b18' not found
>     
>   }
Error: unexpected '}' in "  }"
>   
>   if(playerid %in% b18$player_id = TRUE & playerid %in% b17$player_id = FALSE
Error: unexpected '=' in "  if(playerid %in% b18$player_id ="
>      & playerid %in% b16$player_id = TRUE) {
Error: unexpected '&' in "     &"
>     
>     #in b18 and b16 but not b17
>     
>     
>     average_launch_speed = (((b18$launch_speed * 0.6) + (b16$launch_speed * 0.4)) 
+                             / 2)
Error: object 'b18' not found
>     
>     average_launch_angle = (((b18$launch_angle * 0.6) + (b16$launch_angle * 0.4)) 
+                             / 2)
Error: object 'b18' not found
>     
>   }
Error: unexpected '}' in "  }"
>   
>   if(playerid %in% b18$player_id = FALSE & playerid %in% b17$player_id = TRUE
Error: unexpected '=' in "  if(playerid %in% b18$player_id ="
>      & playerid %in% b16$player_id = TRUE) {
Error: unexpected '&' in "     &"
>     
>     #in b17 and b16 but not b18
>     
>     
>     average_launch_speed = (((b17$launch_speed * 0.6) + (b16$launch_speed * 0.4)) 
+                             / 2)
Error: object 'b17' not found
>     
>     average_launch_angle = (((b17$launch_angle * 0.6) + (b16$launch_angle * 0.4)) 
+                             / 2)
Error: object 'b17' not found
>     
>   }
Error: unexpected '}' in "  }"
>     
>   # next are those in only one single frame/year
>   # this one is only in 18
>   
>   if(playerid %in% b18$player_id = TRUE & playerid %in% b17$player_id = FALSE
Error: unexpected '=' in "  if(playerid %in% b18$player_id ="
>      & playerid %in% b16$player_id = FALSE){
Error: unexpected '&' in "     &"
>     
>     average_launch_speed = b18$launch_speed
Error: object 'b18' not found
>     
>     average_launch_angle = b18$launch_angle 
Error: object 'b18' not found
>     
>   }
Error: unexpected '}' in "  }"
>     
>   # only in b17
>   
>   if(playerid %in% b18$player_id = FALSE & playerid %in% b17$player_id = TRUE
Error: unexpected '=' in "  if(playerid %in% b18$player_id ="
>      & playerid %in% b16$player_id = FALSE){
Error: unexpected '&' in "     &"
>     
>     average_launch_speed = b17$launch_speed
Error: object 'b17' not found
>     
>     average_launch_angle = b17$launch_angle 
Error: object 'b17' not found
>     
>   }
Error: unexpected '}' in "  }"
>   
>   #only in b16
>   
>   if(playerid %in% b18$player_id = FALSE & playerid %in% b17$player_id = FALSE
Error: unexpected '=' in "  if(playerid %in% b18$player_id ="
>      & playerid %in% b16$player_id = TRUE){
Error: unexpected '&' in "     &"
>     
>     average_launch_speed = b16$launch_speed
Error: object 'b16' not found
>     
>     average_launch_angle = b16$launch_angle 
Error: object 'b16' not found
>     
>   }
Error: unexpected '}' in "  }"
>   
>   # returning a data frame from the function
>   combined_stats = data.frame(playerid, average_launch_speed, average_launch_angle)
Error in data.frame(playerid, average_launch_speed, average_launch_angle) : 
  object 'average_launch_speed' not found
>   
> }
Error: unexpected '}' in "}"
8bit
  • 49
  • 1
  • 7
  • 4
    Pay attention to the difference `=` vs. `==`. – jogo Nov 05 '18 at 14:32
  • @jogo is that how I assign an object? I've been using '=' because I think it works the same as '<-' in the console, is it bad form? and inside my ifs should I use '==' or '<-' instead? – 8bit Nov 05 '18 at 14:48
  • It is about this parts: `if(playerid %in% b18$player_id = TRUE & ...)`; imho there you want `==`. ... but `... == TRUE` is not necessary. `(5 > 3) == TRUE` is the same as `(5 > 3)`. But `(5 > 3) = TRUE` gives an error. – jogo Nov 05 '18 at 14:50
  • `=` is for assignments, while `==` is for comparison – camille Nov 05 '18 at 14:50
  • @jogo ohhh I understand, when doing the test for whether a player is in a year I should use '==' but what about all the other errors about things being unexpected? And does my code look like it will work otherwise? – 8bit Nov 05 '18 at 14:55
  • 1
    Your code is not minimal and (without the data) not reproducible. Please read https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example – jogo Nov 05 '18 at 14:57
  • @jogo thanks for that I'm trying to cut the code and data down now - but i also have everything up on github under code here https://github.com/hammadm1012/hits/issues/1 – 8bit Nov 05 '18 at 16:49
  • With your GitHub sources, I am unable to reproduce your error which refers to an `lapply` call which is nowhere in your source code. – Parfait Nov 05 '18 at 18:19
  • @Parfait sorry I forgot to include that I put the frames in a data list and ran this lapply command in console completed = lapply(data.list, combine.batting.stats) – 8bit Nov 05 '18 at 18:23
  • You cannot call it like that as your `combine.batting.stats` expects to receive 4 non-optional arguments. Here you expect one data frame argument. – Parfait Nov 05 '18 at 18:30
  • @Parfait but when using lapply how can I list all four arguments? It always expects one and then the function -- what would I use in its place if I want every line to be tested (that is every player ID in the playerID_map – 8bit Nov 05 '18 at 19:14

1 Answers1

1

Reconsider your approach with mutually exclusive if logic based on %in% lists as you will receive the following warnings where the logical expression you test will result in a multiple item logical vector (i.e., c(TRUE, TRUE, FALSE, FALSE, ...) since the left hand input to %in% is not one value:

In addition: Warning messages:
1: In if (playerid %in% b18$player_id == TRUE & playerid %in% b17$player_id == : the condition has length > 1 and only the first element will be used
2: In if (playerid %in% b18$player_id == TRUE & playerid %in% b17$player_id == : the condition has length > 1 and only the first element will be used
3: In if (playerid %in% b18$player_id == TRUE & playerid %in% b17$player_id == : the condition has length > 1 and only the first element will be used
...

Instead, consider merging all your four data frames together then running your calculations with vectorized ifelse() that checks across equal length columns in same data frame:

Data

batters_16 <- read.csv("https://raw.githubusercontent.com/hammadm1012/hits/master/batters_16.csv")
batters_17 <- read.csv("https://raw.githubusercontent.com/hammadm1012/hits/master/batters_17.csv")
batters_18 <- read.csv("https://raw.githubusercontent.com/hammadm1012/hits/master/batters_18.csv")

playerID_map <- read.csv("https://raw.githubusercontent.com/hammadm1012/hits/master/playerID_map.csv")

Function

combine.batting.stats_new <-  function(batters_16, batters_17, batters_18, playerID_map){

  # OUTER JOIN MERGE OF DATA FRAMES
  merge_df <- merge(batters_16, batters_17, by="player_id", suffixes=c("_16", "_17"), all=TRUE)
  merge_df <- merge(merge_df, batters_18, by="player_id", all=TRUE)
  merge_df <- merge(merge_df, playerID_map, by.x=c("player_id"), by.y=c("MLBID"), all=TRUE)

  # ADD TWO NEW COLUMNS
  merge_df <- within(merge_df, {
                            # ALL YEARS INCLUDED 
    average_launch_speed <- ifelse(!is.na(launch_speed) & !is.na(launch_speed_17) & !is.na(launch_speed_16),
                                   (((launch_speed * 0.45) + (launch_speed_17 * 0.35) + (launch_speed_16 * 0.2)) / 3),
                                   # MISSING ONLY 2016
                                   ifelse(!is.na(launch_speed) & !is.na(launch_speed_17) & is.na(launch_speed_16),
                                          ((launch_speed * 0.6) + (launch_speed_17 * 0.4)) / 2,
                                          # MISSING ONLY 2017
                                          ifelse(!is.na(launch_speed) & is.na(launch_speed_17) & !is.na(launch_speed_16),
                                                 ((launch_speed * 0.6) + (launch_speed_16 * 0.4)) / 2,
                                                 # MISSING ONLY 2018
                                                 ifelse(is.na(launch_speed) & !is.na(launch_speed_17) & !is.na(launch_speed_16),
                                                        ((launch_speed_17 * 0.6) + (launch_speed_16 * 0.4)) / 2,
                                                        # MISSING  2016 AND 2017
                                                        ifelse(!is.na(launch_speed) & is.na(launch_speed_17) & is.na(launch_speed_16),
                                                               launch_speed,
                                                               # MISSING  2016 AND 2018
                                                               ifelse(is.na(launch_speed) & !is.na(launch_speed_17) & is.na(launch_speed_16),
                                                                      launch_speed_17,
                                                                      # MISSING  2017 AND 2018
                                                                      ifelse(is.na(launch_speed) & is.na(launch_speed_17) & !is.na(launch_speed_16),
                                                                             launch_speed_16, NA
                                                                      )
                                                               )
                                                        )
                                                 )
                                          )
                                   )
                            )

    average_launch_angle <- ifelse(!is.na(launch_angle) & !is.na(launch_angle_17) & !is.na(launch_angle_16),
                                   (((launch_angle * 0.45) + (launch_angle_17 * 0.35) + (launch_angle_16 * 0.2)) / 3),
                                   ifelse(!is.na(launch_angle) & !is.na(launch_angle_17) & is.na(launch_angle_16),
                                          ((launch_angle * 0.6) + (launch_angle_17 * 0.4)) / 2,
                                          ifelse(!is.na(launch_angle) & is.na(launch_angle_17) & !is.na(launch_angle_16),
                                                 ((launch_angle * 0.6) + (launch_angle_16 * 0.4)) / 2,
                                                 ifelse(is.na(launch_angle) & !is.na(launch_angle_17) & !is.na(launch_angle_16),
                                                        ((launch_angle_17 * 0.6) + (launch_angle_16 * 0.4)) / 2,
                                                        ifelse(!is.na(launch_angle) & is.na(launch_angle_17) & is.na(launch_angle_16),
                                                               launch_angle,
                                                               ifelse(is.na(launch_angle) & !is.na(launch_angle_17) & is.na(launch_angle_16),
                                                                      launch_angle_17,
                                                                      ifelse(is.na(launch_angle) & is.na(launch_angle_17) & !is.na(launch_angle_16),
                                                                             launch_angle_16, NA
                                                                      )
                                                               )
                                                        )
                                                 )
                                          )
                                   )
                             )


  })

  return(merge_df[c("player_id", "average_launch_speed", "average_launch_angle")])

}

Output

output <- combine.batting.stats_new(batters_16, batters_17, batters_18, playerID_map)

# RANDOM SAMPLE OF 10 ROWS
output[sample(nrow(output), 10),]

#      player_id average_launch_speed average_launch_angle
# 156     400091                   NA                   NA
# 984     501745                   NA                   NA
# 291     430001             91.20000            10.700000
# 1911    621020             29.03167             3.548333
# 596     453269                   NA                   NA
# 1723    596142             30.30333             4.211667
# 38      121358                   NA                   NA
# 1733    598265             30.07000             3.570000
# 1153    518886                   NA                   NA
# 591     453214                   NA                   NA

You may notice a very low result such as player (596142) which does align to your weighting:

sapply(list(batters_16, batters_17, batters_18), 
       function(df) subset(df, player_id == 596142)$launch_speed)
# [1] 92.7 90.8 90.2

((90.2*0.45) + (90.8*0.35) + (92.7*0.2))/3
# [1] 30.30333
Parfait
  • 104,375
  • 17
  • 94
  • 125