13

I have two large datasets, df1 and df2. The first dataset, df1, contains the columns 'ID' and 'actual.data'.

df1 <- data.frame(ID=c(1,1,1,2,3,4,4), actual.date=c('10/01/1997','2/01/1998','5/01/2002','7/01/1999','9/01/2005','5/01/2006','2/03/2003'));
dcis <- grep('date$',names(df1));
df1[dcis] <- lapply(df1[dcis],as.Date,'%m/%d/%Y');
df1;

   ID actual.date
1  1  1997-10-01
2  1  1998-02-01
3  1  2002-05-01
4  2  1999-07-01
5  3  2005-09-01
6  4  2006-05-01
7  4  2003-02-03

The second dataset, df2, contains two date fields, 'before,date' and 'after.date', which represent a start and end date, respectively:

df2 <- data.frame(ID=c(1,1,1,2,3,4,4,4), before.date=c('10/1/1996','1/1/1998','1/1/2000','1/1/2001','1/1/2001','1/1/2001','10/1/2004','10/3/2004'), after.date=c('12/1/1996','9/30/2003','12/31/2004','3/31/2006','9/30/2006','9/30/2005','12/30/2004','11/28/2004') );
dcis <- grep('date$',names(df2));
df2[dcis] <- lapply(df2[dcis],as.Date,'%m/%d/%Y');
df2;

   ID before.date after.date
1  1  1996-10-01 1996-12-01
2  1  1998-01-01 2003-09-30
3  1  2000-01-01 2004-12-31
4  2  2001-01-01 2006-03-31
5  3  2001-01-01 2006-09-30
6  4  2001-01-01 2005-09-30
7  4  2004-10-01 2004-12-30
8  4  2004-10-03 2004-11-28

My goal is to create a new column at the end of df1 named 'match' that indicates whether the 'actual.date' for each row in df1 is between the ‘before.date’ and ‘after.date’ of any of the observations of the same ID in df2. If it is between, I want to give the 'match' column a value of 1, otherwise 0 (includes instances with no 'ID' match).

This is the output I am hoping for:

   ID before.date match
1  1  1997-10-01 0
2  1  1998-02-01 1
3  1  2002-05-01 1
4  2  1999-07-01 0
5  3  2005-09-01 1
7  4  2006-05-01 0
8  4  2003-02-03 1

I think this can be done with a for() loop, but I am not too knowledgeable about R.

Sample Data:

    DF1

structure(list(cikcode = c("20", "20", "20", "20", "20", "20",
"20", "20", "20", "20", "20", "20", "20", "20", "20", "20", "20",
"20", "20", "1750"), auditorkey = c("4", "4", "5", "5", "5",
"5", "6", "6", "6", "6", "6", "6", "6", "6", "6", "6", "6", "6",
"6", "4"), yearendeddate = structure(c(4L, 4L, 2L, 2L, 3L, 3L,
5L, 5L, 6L, 6L, 7L, 7L, 8L, 8L, 9L, 9L, 10L, 10L, 11L, 1L), .Label = c("2000-05-31",
"2000-12-30", "2001-12-29", "2002-12-28", "2004-01-03", "2005-01-01",
"2005-12-31", "2006-12-30", "2007-12-29", "2009-01-03", "2010-01-02"
), class = "factor"), source = structure(c(1L, 3L, 1L, 3L, 2L,
3L, 1L, 3L, 1L, 3L, 1L, 3L, 1L, 3L, 1L, 3L, 1L, 3L, 1L, 2L), .Label = c("10-K",
"10-K405", "DEF 14A"), class = "factor"), sourcedate = structure(c(6L,
7L, 2L, 3L, 4L, 5L, 8L, 9L, 10L, 20L, 11L, 12L, 13L, 14L, 15L,
16L, 17L, 18L, 19L, 1L), .Label = c("2000-08-24", "2001-03-26",
"2001-03-28", "2002-03-20", "2002-03-25", "2003-03-27", "2003-03-31",
"2004-04-01", "2004-04-06", "2005-03-31", "2006-03-23", "2006-03-28",
"2007-03-09", "2007-03-27", "2008-03-12", "2008-04-04", "2009-03-13",
"2009-04-06", "2010-03-15", "2005-04-04"), class = "factor"),
    financialsdate = structure(c(4L, 4L, 2L, 2L, 3L, 3L, 5L,
    5L, 6L, 6L, 7L, 7L, 8L, 8L, 9L, 9L, 10L, 10L, 11L, 1L), .Label = c("2000-05-31",
    "2000-12-30", "2001-12-29", "2002-12-28", "2004-01-03", "2005-01-01",
    "2005-12-31", "2006-12-30", "2007-12-29", "2009-01-03", "2010-01-02"
    ), class = "factor"), auditopinionkey = c("3538", "NA", "66900",
    "NA", "78252", "NA", "39225", "NA", "84748", "NA", "102979",
    "NA", "120889", "NA", "148621", "NA", "171604", "NA", "192814",
    "156138"), auditorstatecode = structure(c(2L, NA, 2L, NA,
    2L, NA, 2L, NA, 2L, NA, 2L, NA, 2L, NA, 2L, NA, 2L, NA, 2L,
    1L), .Label = c("IL", "PA"), class = "factor"), auditorstatename = structure(c(2L,
    NA, 2L, NA, 2L, NA, 2L, NA, 2L, NA, 2L, NA, 2L, NA, 2L, NA,
    2L, NA, 2L, 1L), .Label = c("ILLINOIS", "PENNSYLVANIA"), class = "factor"),
    goingconcern = structure(c(1L, NA, 1L, NA, 1L, NA, 1L, NA,
    1L, NA, 1L, NA, 1L, NA, 1L, NA, 1L, NA, 1L, 1L), .Label = "No", class = "factor"),
    goingconcernissuekeylist = structure(c(1L, NA, 1L, NA, 1L,
    NA, 1L, NA, 1L, NA, 1L, NA, 1L, NA, 1L, NA, 1L, NA, 1L, 1L
    ), .Label = "", class = "factor"), goingconcernissuephraselist = structure(c(1L,
    NA, 1L, NA, 1L, NA, 1L, NA, 1L, NA, 1L, NA, 1L, NA, 1L, NA,
    1L, NA, 1L, 1L), .Label = "", class = "factor"), isadditionalopinion = structure(c(1L,
    NA, 1L, NA, 1L, NA, 1L, NA, 1L, NA, 1L, NA, 1L, NA, 1L, NA,
    1L, NA, 1L, 1L), .Label = "No", class = "factor"), restatement = c("NA",
    "0", "NA", "0", "NA", "0", "NA", "0", "NA", "0", "NA", "0",
    "NA", "0", "NA", "0", "NA", "0", "0", "NA"), yearended = c("NA",
    "2002", "NA", "2000", "NA", "2001", "NA", "2003", "NA", "2004",
    "NA", "2005", "NA", "2006", "NA", "2007", "NA", "2008", "2009",
    "NA"), assets = c("50,459,000", "50,459,000", "54,421,000",
    "54,421,000", "47,644,000", "47,644,000", "83,081,000", "83,081,000",
    "93,016,000", "93,016,000", "89,110,000", "89,110,000", "140,996,000",
    "140,996,000", "184,118,000", "184,118,000", "199,444,000",
    "199,444,000", "204,236,000", "740,998,000"), auditfees = c("123,700",
    "123,700", "200,000", "200,000", "185,000", "185,000", "137,100",
    "137,100", "225,000", "225,000", "244,000", "244,000", "574,000",
    "574,000", "669,000", "669,000", "680,000", "680,000", "643,000",
    "NA"), auditor = c("KPMG LLP", "KPMG LLP", "Arthur Andersen LLP",
    "Arthur Andersen LLP", "Arthur Andersen LLP", "Arthur Andersen LLP",
    "Grant Thornton LLP", "Grant Thornton LLP", "Grant Thornton LLP",
    "Grant Thornton LLP", "Grant Thornton LLP", "Grant Thornton LLP",
    "Grant Thornton LLP", "Grant Thornton LLP", "Grant Thornton LLP",
    "Grant Thornton LLP", "Grant Thornton LLP", "Grant Thornton LLP",
    "Grant Thornton LLP", "KPMG LLP"), earnings = c("3,284,000",
    "3,284,000", "5,838,000", "5,838,000", "1,048,000", "1,048,000",
    "NA", "", "NA", "", "NA", "", "NA", "", "21,321,000", "21,321,000",
    "25,773,000", "25,773,000", "21,555,000", "35,163,000"),
    naicscode = c("334513", "334513", "334513", "334513", "334513",
    "334513", "334513", "334513", "334513", "334513", "334513",
    "334513", "334513", "334513", "334513", "334513", "334513",
    "334513", "334513", "334613"), revenue = c("68,231,000",
    "68,231,000", "84,912,000", "84,912,000", "71,819,000", "71,819,000",
    "94,676,000", "94,676,000", "112,494,000", "112,494,000",
    "118,940,000", "118,940,000", "148,127,008", "148,127,008",
    "201,677,000", "201,677,000", "243,018,000", "243,018,000",
    "190,774,000", "1,024,333,000"), siccode = c("3823", "3823",
    "3823", "3823", "3823", "3823", "3823", "3823", "3823", "3823",
    "3823", "3823", "3823", "3823", "3823", "3823", "3823", "3823",
    "3823", "3720"), statecode = c("NJ", "NJ", "NJ", "NJ", "NJ",
    "NJ", "NJ", "NJ", "NJ", "NJ", "NJ", "NJ", "NJ", "NJ", "NJ",
    "NJ", "NJ", "NJ", "NJ", "IL"), statename = c("NEW JERSEY",
    "NEW JERSEY", "NEW JERSEY", "NEW JERSEY", "NEW JERSEY", "NEW JERSEY",
    "NEW JERSEY", "NEW JERSEY", "NEW JERSEY", "NEW JERSEY", "NEW JERSEY",
    "NEW JERSEY", "NEW JERSEY", "NEW JERSEY", "NEW JERSEY", "NEW JERSEY",
    "NEW JERSEY", "NEW JERSEY", "NEW JERSEY", "ILLINOIS"), ticker = c("NA",
    "", "NA", "", "NA", "", "NA", "", "NA", "", "NA", "", "NA",
    "", "NA", "", "NA", "", "", "AIR")), .Names = c("cikcode",
"auditorkey", "yearendeddate", "source", "sourcedate", "financialsdate",
"auditopinionkey", "auditorstatecode", "auditorstatename", "goingconcern",
"goingconcernissuekeylist", "goingconcernissuephraselist", "isadditionalopinion",
"restatement", "yearended", "assets", "auditfees", "auditor",
"earnings", "naicscode", "revenue", "siccode", "statecode", "statename",
"ticker"), row.names = c(NA, 20L), class = "data.frame")


DF2

structure(list(cikcode = c(320193L, 72971L, 72971L, 200406L,
40545L, 40545L, 1114448L, 19617L, 19617L, 1067983L, 70858L, 313807L,
1578845L, 1113172L, 64803L, 1135644L, 731766L, 14272L, 14272L,
66740L), auditoratdisclosuredate = c("KPMG LLP", "KPMG LLP",
"KPMG LLP", "PricewaterhouseCoopers LLP", "KPMG LLP", "KPMG LLP",
"PricewaterhouseCoopers LLP", "PricewaterhouseCoopers LLP", "PricewaterhouseCoopers LLP",
"Deloitte & Touche LLP", "PricewaterhouseCoopers LLP", "Ernst & Young LLP",
"PricewaterhouseCoopers LLP", "PricewaterhouseCoopers LLP", "KPMG LLP",
"KPMG LLP", "Deloitte & Touche LLP", "PricewaterhouseCoopers LLP",
"PricewaterhouseCoopers LLP", "PricewaterhouseCoopers LLP"),
    auditoratdisclosuredatekey = c("4", "4", "4", "1", "4", "4",
    "1", "1", "1", "3", "1", "2", "1", "1", "4", "4", "3", "1",
    "1", "1"), auditorduringrestatedperiod = c("|KPMG LLP|",
    "|KPMG LLP|", "|KPMG LLP|", "|PricewaterhouseCoopers LLP|",
    "|KPMG LLP|", "|KPMG LLP|", "|PricewaterhouseCoopers LLP|PricewaterhouseCoopers LLP (PricewaterhouseCoopers AG, SWITZERLAND)|",
    "|PricewaterhouseCoopers LLP|", "|PricewaterhouseCoopers LLP|",
    "|Deloitte & Touche LLP|", "|PricewaterhouseCoopers LLP|PricewaterhouseCoopers LLP (Price Waterhouse & Co SRL, ARGENTINA)|",
    "|Ernst & Young LLP|Ernst & Young LLP (Ernst & Young LLP, UNITED KINGDOM)|",
    "|PricewaterhouseCoopers LLP|", "|Deloitte & Touche LLP (Deloitte Touche Tohmatsu Auditores Independentes, BRAZIL)|Deloitte & Touche LLP (KPMG Auditores Independentes, BRAZIL)|KPMG LLP (Deloitte Touche Tohmatsu Auditores Independentes, BRAZIL)|KPMG LLP (KPMG Auditores Independentes, BRAZIL)|",
    "|KPMG LLP|", "|KPMG LLP (KPMG Deutsche Treuhand-Gesellschaft AG WPG, GERMANY)|",
    "|Arthur Andersen LLP|Deloitte & Touche LLP|", "|PricewaterhouseCoopers LLP|",
    "|PricewaterhouseCoopers LLP|", "|PricewaterhouseCoopers LLP|"
    ), auditorduringrestatedperiodkeys = c("|4|", "|4|", "|4|",
    "|1|", "|4|", "|4|", "|1|", "|1|", "|1|", "|3|", "|1|", "|2|",
    "|1|", "|3|4|", "|4|", "|4|", "|5|3|", "|1|", "|1|", "|1|"
    ), auditoropinionperiodendduringrestatedperiod = c("|KPMG LLP|",
    "|KPMG LLP|", "|KPMG LLP|", "", "|KPMG LLP|", "|KPMG LLP|",
    "|PricewaterhouseCoopers LLP[PricewaterhouseCoopers AG]|",
    "|PricewaterhouseCoopers LLP|", "", "", "|PricewaterhouseCoopers LLP|",
    "|Ernst & Young LLP[Ernst & Young LLP]|", "|PricewaterhouseCoopers LLP|",
    "|KPMG LLP[KPMG Auditores Independentes]|", "", "|KPMG LLP[KPMG Deutsche Treuhand-Gesellschaft AG WPG]|",
    "|Arthur Andersen LLP|Deloitte & Touche LLP|", "|PricewaterhouseCoopers LLP|",
    "|PricewaterhouseCoopers LLP|", "|PricewaterhouseCoopers LLP|"
    ), auditoropinionperiodendduringrestatedperiodkeys = c("|4|",
    "|4|", "|4|", "", "|4|", "|4|", "|1|", "|1|", "", "", "|1|",
    "|2|", "|1|", "|4|", "", "|4|", "|5|3|", "|1|", "|1|", "|1|"
    ), restatementkey = c("10196", "12617", "42632", "44280",
    "857", "22310", "7995", "10187", "38972", "5379", "6357",
    "9495", "49566", "32005", "45575", "15890", "13410", "695",
    "3752", "36923"), restatedperiodbegin = c("1996-10-01", "1998-01-01",
    "2005-01-01", "2012-01-01", "2001-01-01", "2001-01-01", "2003-01-01",
    "2003-01-01", "2012-01-01", "2001-07-01", "2001-01-01", "2000-01-01",
    "2014-01-01", "2007-01-01", "1997-01-01", "2000-10-01", "1994-01-01",
    "2001-01-01", "1999-01-01", "2010-01-01"), restatedperiodended = c("2006-04-01",
    "2003-09-30", "2007-09-30", "2012-07-01", "2006-03-31", "2006-09-30",
    "2004-12-31", "2006-03-31", "2012-03-31", "2001-09-30", "2005-09-30",
    "2004-12-31", "2014-12-31", "2008-12-31", "1998-12-31", "2006-09-30",
    "2006-09-30", "2003-09-30", "2002-06-30", "2011-03-31"),
    disclosure = c("Press Release", "10-Q/A", "10-K", "10-Q",
    "8-K", "8-K", "20-F", "8-K", "8-K", "10-Q/A", "8-K", "20-F/A",
    "10-Q", "20-F", "8-K", "20-F", "8-K", "8-K", "Press Release",
    "10-Q"), disclosuredate = c("2006-06-29", "2004-01-16", "2008-02-29",
    "2012-11-09", "2005-05-06", "2007-01-19", "2006-01-30", "2006-08-03",
    "2012-07-13", "2002-03-15", "2006-02-22", "2006-06-13", "2015-05-11",
    "2010-04-21", "1999-11-15", "2006-12-11", "2006-04-07", "2004-01-29",
    "2002-10-22", "2011-08-04"), cumulativechangeinnetincome = c(NaN,
    NaN, 0, 0, 3.04344437838208e-315, NaN, NaN, 0, NaN, 0, 2.66795448754273e-315,
    0, 0, 0, 0, NaN, NaN, NaN, NaN, 0), accountingrulegaapfasbapplicationfailures = c("|Deferred, stock-based and/or executive comp issues|Deferred, stock-based options backdating only (subcategory)|",
    "|Lease,  SFAS 5, legal, contingency and commitment issues|Lease, leasehold and FAS 13 (98) only (subcategory)|",
    "|Cash flow statement (SFAS 95) classification errors|Lease,  SFAS 5, legal, contingency and commitment issues|",
    "|Cash flow statement (SFAS 95) classification errors|",
    "|Financial derivatives/hedging (FAS 133) acct issues|",
    "|Financial derivatives/hedging (FAS 133) acct issues|Foreign, related party, affiliated, or subsidiary issues|",
    "|PPE intangible or fixed asset (value/diminution) issues|Acquisitions, mergers, disposals, re-org acct  issues|Consolidation issues incl Fin 46 variable interest  & off-B/S|Deferred, stock-based and/or executive comp issues|Tax expense/benefit/deferral/other (FAS 109) issues|GAAP - Changes in Acct Principles FASB/EITF or Foreign GAAP|Acquisitions, mergers, only (subcategory) acct issues|PPE issues - Intangible assets, goodwill only (subcategory)|Pension and other post-retirement benefit issues|",
    "|Accounts/loans receivable, investments & cash issues|Cash flow statement (SFAS 95) classification errors|",
    "|Revenue recognition issues|", "|Consolidation issues incl Fin 46 variable interest  & off-B/S|Intercompany, investment in  subs./affiliate issues|",
    "|Financial derivatives/hedging (FAS 133) acct issues|",
    "|Revenue recognition issues|Financial derivatives/hedging (FAS 133) acct issues|Inventory, vendor and/or cost of sales issues|GAAP - Changes in Acct Principles FASB/EITF or Foreign GAAP|",
    "|Debt, quasi-debt, warrants & equity ( BCF) security issues|Foreign, related party, affiliated, or subsidiary issues|Cash flow statement (SFAS 95) classification errors|Foreign, subsidiary only issues (subcategory)|",
    "", "|Acquisitions, mergers, disposals, re-org acct  issues|Acquisitions, mergers, only (subcategory) acct issues|",
    "|Tax expense/benefit/deferral/other (FAS 109) issues|",
    "|Deferred, stock-based and/or executive comp issues|Deferred, stock-based options backdating only (subcategory)|",
    "|Acquisitions, mergers, disposals, re-org acct  issues|Consolidation issues incl Fin 46 variable interest  & off-B/S|Accounts/loans receivable, investments & cash issues|Tax expense/benefit/deferral/other (FAS 109) issues|Fin Statement,  footnote & segment disclosure issues|Pension and other post-retirement benefit issues|",
    "|Revenue recognition issues|Accounts/loans receivable, investments & cash issues|Inventory, vendor and/or cost of sales issues|",
    "|Cash flow statement (SFAS 95) classification errors|"),
    financialfraudirregularitiesandmisrepresentations = c("",
    "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
    "", "", "", ""), errorsaccountingandclericalapplications = c("",
    "", "", "", "", "", "", "", "", "", "", "", "", "|EPS, ratio and classification of income statement issues|",
    "", "", "", "", "", ""), othersignificantissues = c("", "",
    "", "", "", "", "", "", "|Z - Material Weakness - Section 404 or 302 issues identified|",
    "", "", "", "", "", "", "", "", "", "|Y - Loan covenant violations/issues|",
    ""), secinvestigation = c("", "", "Y", "", "Y", "Y", "",
    "", "", "", "", "", "", "", "", "", "", "", "", "")), .Names = c("cikcode",
"auditoratdisclosuredate", "auditoratdisclosuredatekey", "auditorduringrestatedperiod",
"auditorduringrestatedperiodkeys", "auditoropinionperiodendduringrestatedperiod",
"auditoropinionperiodendduringrestatedperiodkeys", "restatementkey",
"restatedperiodbegin", "restatedperiodended", "disclosure", "disclosuredate",
"cumulativechangeinnetincome", "accountingrulegaapfasbapplicationfailures",
"financialfraudirregularitiesandmisrepresentations", "errorsaccountingandclericalapplications",
"othersignificantissues", "secinvestigation"), row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19", "20"), class = "data.frame")
michael
  • 412
  • 1
  • 3
  • 13
  • 2
    You may use `foverlaps` from `data.table` – akrun Jul 11 '15 at 05:12
  • I get some errors by copy/pasting the dput i.e. `Error in structure(list(cikcode = c("20", "20", "20", "20", "20", "20", : object 'auditorkey' not found` I wonder why some are not quoted at the end? For e.g. `auditorkey`, `auditopinionkey`, ... – akrun Jul 12 '15 at 12:46
  • I tried to put quotes and then run it again. Now the error is `Error in structure(c(1L, 3L, 1L, 3L, 2L, 3L, 1L, 3L, 1L, 3L, 1L, 3L, 1L, : object 'K405' not found` Can you please run the dput again to see if everything is right – akrun Jul 12 '15 at 12:51
  • BTW, which are the date columns in both datasets that you are comparing? In `DF1`, there is `yearendeddate`, `sourcedate`, `financialdate`. In `DF2`, `restatedperiodbegin`, `restatedperiodended`, and `disclosuredate`. Also, which is common ID column/columns between these two datasets? – akrun Jul 12 '15 at 13:35
  • Quoting my comment below: "start date - restatedperiodbegin end date- restatedperiodended ID - cikcode actual date- yearendeddate" – michael Jul 12 '15 at 13:44
  • Thanks, but in the example you provided, the IDs don't have an overlap, so it will result in all `NA` for the `indx`. Also, I noticed that the column classes in both datasets are different for the 'Dates'. In one, it is 'character', while it is 'factor'. I would change the column classes to be the same. It also applied to 'cikcode'. BTW, I am not getting error with `foverlaps` code. – akrun Jul 12 '15 at 13:56
  • Can index handle it when there is NA? I thought I already changed everything in both datasets to characters. I'm unsure how to rewrite the code. Can foverlap still work with no overlap. I assumed in my dataset there are at least some occurrences with this. – michael Jul 12 '15 at 14:25
  • What I meant is that there is no overlaps for `cikcode`. I assumed that you want to do this for each `cikcode`. If the ID code is not a factor, then you can use the `start`, `end` columns after converting to 'Date' class. – akrun Jul 12 '15 at 14:27
  • This is what I did with your dput data, basically the same thing as in the post `df2 <- DF2[c('cikcode', 'restatedperiodbegin', 'restatedperiodended')];colnames(df2) <- c('ID', 'start', 'end'); df2[-1] <- lapply(df2[-1], as.Date);dt2 <- as.data.table(df2);setkeyv(dt2, colnames(dt2));df1 <- DF1[c('cikcode', 'yearendeddate', 'yearendeddate')];colnames(df1) <- colnames(df2); df1$ID <- as.integer(df1$ID);df1[-1] <- lapply(df1[,-1], as.Date);dt1 <- as.data.table(df1);setkeyv(dt1, colnames(dt1));indx <- foverlaps(dt1, dt2, type='within', which=TRUE, mult='first')` – akrun Jul 12 '15 at 14:31
  • well, for each cikcode in df2 i wanted to search for overlaps in start and end in df2. I'm really not sure if that is what you are referring to, but that is what I am trying to do before combining the data. However, looking for overlaps would not be necessary if actual date could scan all rows in df2 for its particular cikcode and return a value if it fits any of those rows – michael Jul 12 '15 at 20:35
  • @akrun im trying to convert factors and chararacters to dates. for instance, i try df2$restatedperiodbegin <- as.Date(df2$restatedperiodbegin, "%m/%d/%") but it deletes everything within restatedperiodbegin column – michael Jul 12 '15 at 23:17
  • Yes, I was doing that for each `cikcode`. Regarding your code, you are missing `%Y` if the date column is of the format `05/25/2005` – akrun Jul 13 '15 at 03:40

3 Answers3

10

You may use foverlaps from data.table. Convert both the data.frames to data.table with start/end columns. Set the key column as the column names of each dataset. Use foverlaps to get the numeric index which can be converted to binary match based on the NA values in it.

library(data.table)#v1.9.5+
dt1 <- data.table(ID=df1$ID, start=df1$actual.date, end=df1$actual.date)
setkeyv(dt1, colnames(dt1))
dt2 <- as.data.table(df2)
setnames(dt2, 2:3, c('start', 'end'))
setkeyv(dt2, colnames(dt2))
indx <- foverlaps(dt1, dt2, type='within', which=TRUE, mult='first')
dt1[, match:= +(!is.na(indx))][,end:=NULL]
setnames(dt1, 1:2, colnames(df1))
dt1
#   ID actual.date match
#1:  1  1997-10-01     0
#2:  1  1998-02-01     1
#3:  1  2002-05-01     1
#4:  2  1999-07-01     0
#5:  3  2005-09-01     1
#6:  4  2003-02-03     1
#7:  4  2006-05-01     0
Mus
  • 7,290
  • 24
  • 86
  • 130
akrun
  • 874,273
  • 37
  • 540
  • 662
  • does this solution incorporate an outer join of the data? – michael Jul 12 '15 at 02:20
  • something went wrong. I got the error messages: 1)Error in foverlaps(dt1, dt2, type = "within", which = TRUE, mult = "first") : length(by.x) != length(by.y). Columns specified in by.x should correspond to columns specified in by.y and should be of same lengths. 2)Error in setnames(dt1, 1:2, colnames(merged_combined_files12)) : 'old' is length 2 but 'new' is length 25. I looked at my data and there are only three columns showing: 'ID', 'start', and 'match'. And for 'match' there is a value of 1 for every single row – michael Jul 12 '15 at 03:50
  • @michael If you look at both the datasets `dt1` and `dt2`, the column names are the same. I am using the `devel` version of `data.table`. Did you get the error with the example data (which worked fine for me)? – akrun Jul 12 '15 at 05:50
  • yeah, but i get the error using my data. I was going to use dput but I forgot how to use it and you deleted your comment in the other post – michael Jul 12 '15 at 11:49
  • Error in foverlaps(dt1, dt2, type = "within", which = TRUE, mult = "first") : length(by.x) != length(by.y). Columns specified in by.x should correspond to columns specified in by.y and should be of same lengths. > dt1[, match:= +(!is.na(indx))][,end:=NULL] >setnames(dt1, 1:2, colnames(merged_combined_files12)) Error in setnames(dt1, 1:2, colnames(merged_combined_files12)) : 'old' is length 2 but 'new' is length 25 – michael Jul 12 '15 at 11:54
  • @michael The command is `dput(droplevels(head(yourdata,20)))` for each datasets – akrun Jul 12 '15 at 11:59
  • do i need to move the columns to the end of my real dataset? i noticed that foverlaps requires this. – michael Jul 12 '15 at 12:14
  • before you included columns in the dput function as well – michael Jul 12 '15 at 12:16
  • @michael How many columns do you have in the original dataset?. We only need the `start`, `end` and also the ID columns as in my post – akrun Jul 12 '15 at 12:20
  • movetolast <- function(data, move) { data[c(setdiff(names(data), move), move)] } movetolast(df, c("before.date", "end.date")) – michael Jul 12 '15 at 12:25
  • i have varying column lengths in d1 and d2. I need all variables from d1 and d2 in the join. I want it to be an outer join so using something like foverlaps(df1, df2, type = "within", nomatch =NA) would be better – michael Jul 12 '15 at 12:29
  • @michael In the example you showed also, it has varying rows. `8` vs `7`. What I meant is that you need to get only `match` column, for which you need `start, `end` and possibly `ID` columns. The end results can be joined later with the original datasets – akrun Jul 12 '15 at 12:30
  • i have included sample data – michael Jul 12 '15 at 12:39
  • start - restatedperiodbegin end - restatedperiodended ID - cikcode actual- yearendeddate – michael Jul 12 '15 at 12:43
3

Here is a solution with dplyr

library(dplyr)
dat <- inner_join(df1, df2, by = "ID")
dat %>% rowwise() %>%
        mutate(match = ifelse(between(actual.date, before.date, after.date), 1, 0)) %>%
        select(-c(before.date, after.date)) %>%
        arrange(actual.date, desc(match)) %>%
        distinct(actual.date)

the output is slightly different because it order the actual.date, maybe this is a problem, I'll delete my solution if the case.

Source: local data frame [7 x 3]

  ID actual.date match
1  1  1997-10-01     0
2  1  1998-02-01     1
3  2  1999-07-01     0
4  1  2002-05-01     1
5  4  2003-02-03     1
6  3  2005-09-01     1
7  4  2006-05-01     0
SabDeM
  • 7,050
  • 2
  • 25
  • 38
  • does this solution incorporate an outer join of the data? – michael Jul 12 '15 at 02:21
  • @michael it join the data, I've used the `inner_join` function. I do know... when I developed the solution it seemed an easy way. – SabDeM Jul 12 '15 at 23:04
  • Unrelated, but do you have any idea why the data in my column is deleted when trying to convert convert factors and chararacters to dates? for instance, i try df2$restatedperiodbegin <- as.Date(df2$restatedperiodbegin, "%m/%d/%") – michael Jul 12 '15 at 23:18
  • I get the following error `Error in mutate_impl(.data, dots) : Evaluation error: Expecting a single value: [extent=170]..` when I use `between` and `dplyr` together. Basically your this statement does not work in the new version of dplyr. `mutate(match = ifelse(between(actual.date, before.date, after.date), 1, 0))` – Lazarus Thurston Aug 20 '17 at 12:03
0

Just another hopefully correct answer using the fuzzyjoin package.

library(data.table)
library(fuzzyjoin)
dt1 <- data.table(df1)
dt2 <- data.table(df2)

fuzzy_left_join(dt1
                , dt2, 
                by = c("ID" = "ID", "actual.date" = "before.date", "actual.date" = "after.date"), 
                match_fun = list(`==`, `>`, `<`))[,.(ID = ID.x
                                                     ,actual.date
                                                     , match = ifelse(is.na(ID.y),0,1))]
hannes101
  • 2,410
  • 1
  • 17
  • 40