1

I have two dataframes: (1) df_kader (df_kaderExample for you) with almost 76,000 observations. The observations show per club (club_id) which players (spieler_id) were in the squad of the club in the respective season (2010-2019). (2) df_injuries (df_verletzungenExample for you) with almost 60,000 observations. For each player (player_id), the observations show, among other things, the type of injury and the time period.

For my further processing I want to find out from when to when a player played in a club. The goal is to assign a club (club_id) to each observation from df_injuries (best possible).

Problem: Some players have belonged to several clubs per season. For example, a player can change in the middle of the season and therefore has two observations in the 2010 season (as seen in the example in the year 2015).

The following two ideas I pursue: 1.) I assign to each observation in df_kader whether the player has played in the club in the first and second half of the season or not. 2.) I try to use the available information of each observation in df_kader to assign the exact times from when to when the player has played in the club in the season concerned.

I think variant 1 will be easier but less accurate, variant 2 more accurate but maybe not possible with the available information.

For variant 1 I have developed an idea how to assign the first (hinrunde Boolean) and second (rueckrunde Boolean) half of the season. For this I use the following code to fill the columns for the first and second half of the season. As an example I have also added the data for a data set.

library(readxl)
library(tidyverse)
library(lubridate)

############################## (START) 
df_kader2010 <- subset(df_kaderExample, df_kaderExample$saison == 2015 & df_kaderExample$spieler_id == 46580)
df_kader2010 <- arrange(df_kader2010, df_kader2010$im_team_seit)
anzahlZeilen <- nrow(df_kader2010)

# Beim letzten Eintrag der Liste hat der Spieler auf jeden Fall in der Rückrunde gespielt
df_kader2010[anzahlZeilen, ]$rueckrunde <- TRUE
df_kader2010[anzahlZeilen, ]$gespielt_von <- df_kader2010[anzahlZeilen, ]$im_team_seit
df_kader2010[anzahlZeilen-1, ]$gespielt_bis <- df_kader2010[anzahlZeilen, ]$im_team_seit-1

# Bei allen anderen Einträgen hat er nicht in der Rückrunde gespielt
i <- anzahlZeilen-1
while(i > 0) {
  df_kader2010[i, ]$rueckrunde <- FALSE
  i <- i-1
}

if (month(df_kader2010[anzahlZeilen, ]$im_team_seit) == 12 | year(df_kader2010[anzahlZeilen, ]$im_team_seit) >= 2010) {
  df_kader2010[anzahlZeilen, ]$hinrunde <- FALSE
  df_kader2010[anzahlZeilen-1, ]$hinrunde <- TRUE
  
  i <- anzahlZeilen-2
  while(i > 0) {
    df_kader2010[i, ]$hinrunde <- FALSE
    i <- i-1
  }
} else {
  df_kader2010[anzahlZeilen, ]$hinrunde <- TRUE
  
  i <- anzahlZeilen-1
  while(i > 0) {
    df_kader2010[i, ]$hinrunde <- FALSE
    i <- i-1
  }
}
############################## (ENDE)

df_kaderExample

My concrete question to the professionals would be, how can I let my little algorithm work on all the observations? Do I have to use a classic double FOR loop? I think its not the best way doing something like this :)

for(saison in saisons) {
  
  for(spieler in alleSpieler) {
    
    MY ALGORITHM
    
  }
  
}

df_kaderExample

structure(list(verein_id = c(533, 533, 533, 533, 533, 1003, 1003
), verein_name = c("TSG 1899 Hoffenheim", "TSG 1899 Hoffenheim", 
"TSG 1899 Hoffenheim", "TSG 1899 Hoffenheim", "TSG 1899 Hoffenheim", 
"Leicester City", "Leicester City"), saison = c(2015, 2016, 2017, 
2018, 2019, 2014, 2015), spieler_id = c(46580, 46580, 46580, 
46580, 46580, 46580, 46580), rueckennummer = c(27, 27, 27, 27, 
27, 40, NA), im_team_seit = structure(list(sec = c(0, 0, 0, 0, 
0, 0, 0), min = c(0L, 0L, 0L, 0L, 0L, 0L, 0L), hour = c(0L, 0L, 
0L, 0L, 0L, 0L, 0L), mday = c(20L, 1L, 1L, 1L, 1L, 16L, 16L), 
    mon = c(0L, 6L, 6L, 6L, 6L, 0L, 0L), year = c(116L, 116L, 
    116L, 116L, 116L, 115L, 115L), wday = c(3L, 5L, 5L, 5L, 5L, 
    5L, 5L), yday = c(19L, 182L, 182L, 182L, 182L, 15L, 15L), 
    isdst = c(0L, 1L, 1L, 1L, 1L, 0L, 0L), zone = c("CET", "CEST", 
    "CEST", "CEST", "CEST", "CET", "CET"), gmtoff = c(NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_)), class = c("POSIXlt", "POSIXt"), tzone = ""), 
    vertrag_bis = structure(list(sec = c(NA, 0, 0, 0, 0, NA, 
    NA), min = c(NA, 0L, 0L, 0L, 0L, NA, NA), hour = c(NA, 0L, 
    0L, 0L, 0L, NA, NA), mday = c(NA, 30L, 30L, 30L, 30L, NA, 
    NA), mon = c(NA, 5L, 5L, 5L, 5L, NA, NA), year = c(NA, 122L, 
    122L, 122L, 122L, NA, NA), wday = c(NA, 4L, 4L, 4L, 4L, NA, 
    NA), yday = c(NA, 180L, 180L, 180L, 180L, NA, NA), isdst = c(-1L, 
    1L, 1L, 1L, 1L, -1L, -1L), zone = c("", "CEST", "CEST", "CEST", 
    "CEST", "", ""), gmtoff = c(NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_)), class = c("POSIXlt", 
    "POSIXt"), tzone = ""), marktwert = c(7.5, 15, 27, 38, 30.5, 
    8, 7.5), hinrunde = c(NA, NA, NA, NA, NA, NA, NA), rueckrunde = c(NA, 
    NA, NA, NA, NA, NA, NA), gespielt_von = structure(list(sec = c(NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_
    ), min = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_), hour = c(NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_), mday = c(NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_), mon = c(NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_), year = c(NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_), wday = c(NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_), yday = c(NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_), isdst = c(-1L, 
    -1L, -1L, -1L, -1L, -1L, -1L), zone = c("", "", "", "", "", 
    "", ""), gmtoff = c(NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_)), class = c("POSIXlt", 
    "POSIXt"), tzone = ""), gespielt_bis = structure(list(sec = c(NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_
    ), min = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_), hour = c(NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_), mday = c(NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_), mon = c(NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_), year = c(NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_), wday = c(NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_), yday = c(NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_), isdst = c(-1L, 
    -1L, -1L, -1L, -1L, -1L, -1L), zone = c("", "", "", "", "", 
    "", ""), gmtoff = c(NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_)), class = c("POSIXlt", 
    "POSIXt"), tzone = "")), row.names = c(NA, -7L), class = c("tbl_df", 
"tbl", "data.frame"))

df_kaderExample

df_verletzungeExample

structure(list(spieler_id = c(46580, 46580, 46580, 46580, 46580, 
46580, 46580, 46580, 46580), saison = c(2019, 2019, 2019, 2019, 
2019, 2018, 2017, 2017, 2016), verletzung = c("Knieprobleme", 
"Leistenprobleme", "Leistenprobleme", "Knieprobleme", "Knieverletzung", 
"Innenbanddehnung Knie", "Blessur", "Entzündung der Fußsohle", 
"Schulterverletzung"), von = structure(list(sec = c(0, 0, 0, 
0, 0, 0, 0, 0, 0), min = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L
), hour = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), mday = c(10L, 
17L, 8L, 7L, 17L, 1L, 14L, 3L, 20L), mon = c(4L, 1L, 1L, 10L, 
6L, 7L, 4L, 6L, 3L), year = c(120L, 120L, 120L, 119L, 119L, 118L, 
118L, 117L, 117L), wday = c(0L, 1L, 6L, 4L, 3L, 3L, 1L, 1L, 4L
), yday = c(130L, 47L, 38L, 310L, 197L, 212L, 133L, 183L, 109L
), isdst = c(1L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L), zone = c("CEST", 
"CET", "CET", "CET", "CEST", "CEST", "CEST", "CEST", "CEST"), 
    gmtoff = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_
    )), class = c("POSIXlt", "POSIXt"), tzone = ""), bis = structure(list(
    sec = c(0, 0, 0, 0, 0, 0, 0, 0, 0), min = c(0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L), hour = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L), mday = c(5L, 26L, 9L, 21L, 7L, 27L, 17L, 20L, 27L
    ), mon = c(5L, 1L, 1L, 10L, 9L, 7L, 4L, 6L, 3L), year = c(120L, 
    120L, 120L, 119L, 119L, 118L, 118L, 117L, 117L), wday = c(5L, 
    3L, 0L, 4L, 1L, 1L, 4L, 4L, 4L), yday = c(156L, 56L, 39L, 
    324L, 279L, 238L, 136L, 200L, 116L), isdst = c(1L, 0L, 0L, 
    0L, 1L, 1L, 1L, 1L, 1L), zone = c("CEST", "CET", "CET", "CET", 
    "CEST", "CEST", "CEST", "CEST", "CEST"), gmtoff = c(NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_)), class = c("POSIXlt", 
"POSIXt"), tzone = ""), tage = c(26, 9, 1, 14, 82, 26, 3, 17, 
7), verpasste_spiele = c(4, 1, 1, 1, 8, 2, NA, NA, 1), verein_id = c(NA, 
NA, NA, NA, NA, NA, NA, NA, NA)), row.names = c(NA, -9L), class = c("tbl_df", 
"tbl", "data.frame"))

df_verletzungeExample

edstrinova
  • 101
  • 1
  • 7
  • Just to clarify, you want to add the club to the second dataframe, i.e. `df_verletzungen`? You have the time range for the injury in that data range (`von`, `bis`) and you have the time range for the contract in the first data frame (`im_team_seit`, `vertrag_bis`). This sounds like a merge on a date range, which doesn't make it easier, but thats been asked and answered in various forms before. – coffeinjunky Nov 08 '20 at 01:19
  • Here is an example: https://stackoverflow.com/questions/23095896/merging-two-dataframes-on-a-date-range-in-r and also here: https://community.rstudio.com/t/tidy-way-to-range-join-tables-on-an-interval-of-dates/7881 – coffeinjunky Nov 08 '20 at 01:24
  • I do not have the time range. The column "vertrag_bis" isn't correct. It only shows the current state, unfortunately not the past. You just can ignore the column "vertrag_bis". So a simple merge is not possible. – edstrinova Nov 08 '20 at 08:00

0 Answers0