I have a fairly large adjacency matrix and only want to keep the relationships that have had at least 5 transactions with each other. How would you do this? Would it make sense to assign 0 to all values less than 5 or is there a more sensible approach?
Once I have the new adjacency matrix, how can I output the relationships as a list in which each ID is shown together with its associated "partners"?
Many thanks for your help :)!
Here is my code for the adjacency matrix so far:
# Work on the first 50,000 rows of `newdata` only and label the two columns.
dd <- head(newdata, n = 50000)
colnames(dd) <- c("MEMBER_ID", "AUTHOR_ID")

# Contingency table: rows are MEMBER_ID levels, columns are AUTHOR_ID levels,
# cells hold the number of rows in `dd` with that combination.
x <- xtabs(~ MEMBER_ID + AUTHOR_ID, data = dd)

# t(x) %*% x: a square matrix indexed by AUTHOR_ID on both dimensions.
mm <- crossprod(x)

# The product is symmetric, so blank out the diagonal and the lower triangle
# to keep each unordered pair exactly once.
is.na(mm) <- lower.tri(mm, diag = TRUE)
Here is a View() of the result in RStudio.
That's what I would like to have for each ID pair in my dataset.
For completeness, here is a reproducible sample of my original data, SubsMAIN:
# > dput(head(SubsMAIN, 100))
structure(list(MEMBER_ID = c(199781, 199781, 199781, 199781,
199781, 199781, 199781, 199781, 199781, 199781, 199781, 199781,
199781, 199781, 199781, 199781, 199781, 199781, 199781, 199781,
199781, 199781, 199781, 199781, 199781, 199781, 199781, 199781,
199781, 199781, 199781, 199781, 199781, 199781, 199781, 199781,
199781, 199781, 199781, 199781, 199781, 199781, 199781, 199781,
199781, 199781, 199781, 199781, 199781, 199781, 199781, 199781,
199781, 199781, 199781, 199781, 199781, 199781, 199781, 199781,
199781, 199781, 199781, 199781, 199781, 199781, 199781, 199781,
199781, 199781, 199781, 199781, 199781, 199781, 199781, 199781,
199781, 199781, 199781, 199781, 199781, 199781, 199781, 199781,
199781, 199781, 199781, 199781, 199781, 199781, 199781, 199781,
199781, 199781, 199781, 199781, 199781, 199781, 199781, 199781
), RATING = c(5, 5, 5, 3, 5, 5, 4, 5, 3, 4, 5, 5, 5, 3, 4, 4,
2, 5, 5, 5, 4, 5, 5, 5, 5, 4, 5, 3, 5, 4, 5, 4, 4, 3, 3, 2, 5,
3, 5, 4, 5, 5, 5, 5, 5, 4, 5, 5, 5, 4, 5, 5, 4, 4, 5, 5, 5, 3,
4, 4, 5, 5, 5, 5, 4, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 4, 4, 5,
5, 4, 4, 5, 5, 4, 5, 3, 5, 3, 5, 5, 5, 2, 3, 5, 5, 3, 5, 4, 3
), AUTHOR_ID = c(258195, 201494, 409591, 1964674948, 284187,
641414, 686042, 531975, 1892323204, 362579, 301950, 2988937092,
205270, 353623, 657993, 2418118532, 590804, 222936, 216022, 2320404356,
199862, 538993, 290046, 234885, 417532, 1705021316, 216430, 1320783748,
301950, 2012450692, 3267006340, 321415, 213839, 1967230852, 519301,
1880919940, 409850, 617204, 262004, 200165, 3267006340, 345500,
1711443844, 290046, 238184, 241451, 452301, 301950, 205491, 212098,
241578, 2367524740, 2366410628, 225252, 2988937092, 1789300612,
1965068164, 432146, 2151190404, 1772130180, 290046, 203622, 210929,
243427, 205705, 301950, 2551549828, 2250674052, 1378848644, 298157,
1873186692, 526355, 231243, 2988937092, 241578, 547653, 1301319556,
1956417412, 292382, 2571341700, 421709, 2309066628, 256232, 214201,
447962, 278848, 2533396356, 328874, 1955106692, 262822, 1568706436,
458913, 217003, 583640, 307259, 199780, 1836027780, 235786, 2366279556,
358714), STATUS = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), CREATION = c("2001/01/10",
"2001/01/10", "2001/01/10", "2001/01/10", "2001/01/10", "2001/01/10",
"2001/01/10", "2001/01/10", "2001/01/10", "2001/01/10", "2001/01/10",
"2001/01/10", "2001/01/10", "2001/01/10", "2001/01/10", "2001/01/10",
"2001/01/10", "2001/01/10", "2001/01/10", "2001/01/10", "2001/01/10",
"2001/01/10", "2001/01/10", "2001/01/10", "2001/01/10", "2001/01/10",
"2001/01/10", "2001/01/10", "2001/01/10", "2001/01/10", "2001/01/10",
"2001/01/10", "2001/01/10", "2001/01/10", "2001/01/10", "2001/01/10",
"2001/01/10", "2001/01/10", "2001/01/10", "2001/01/10", "2001/01/10",
"2001/01/10", "2001/01/10", "2001/01/10", "2001/01/10", "2001/01/10",
"2001/01/10", "2001/01/10", "2001/01/10", "2001/01/10", "2001/01/10",
"2001/01/10", "2001/01/10", "2001/01/10", "2001/01/10", "2001/01/10",
"2001/01/10", "2001/01/10", "2001/01/10", "2001/01/10", "2001/01/10",
"2001/01/10", "2001/01/10", "2001/01/10", "2001/01/10", "2001/01/10",
"2001/01/10", "2001/01/10", "2001/01/10", "2001/01/10", "2001/01/10",
"2001/01/10", "2001/01/10", "2001/01/10", "2001/01/10", "2001/01/10",
"2001/01/10", "2001/01/10", "2001/01/10", "2001/01/10", "2001/01/10",
"2001/01/10", "2001/01/10", "2001/01/10", "2001/01/10", "2001/01/10",
"2001/01/10", "2001/01/10", "2001/01/10", "2001/01/10", "2001/01/10",
"2001/01/10", "2001/01/10", "2001/01/10", "2001/01/10", "2001/01/10",
"2001/01/10", "2001/01/10", "2001/01/10", "2001/01/10"), LAST_MODIFIED = c("2001/03/24",
"2001/08/25", "2002/12/02", "2001/03/29", "2002/03/22", "2002/04/22",
"2001/01/22", "2001/11/15", "2001/04/10", "2001/03/24", "2001/04/03",
"2001/10/11", "2001/05/08", "2001/03/07", "2002/01/26", "2002/03/10",
"2001/03/24", "2001/03/25", "2001/01/28", "2001/09/06", "2001/05/22",
"2001/05/03", "2001/01/18", "2001/10/26", "2002/01/09", "2001/08/21",
"2001/02/09", "2001/03/14", "2002/03/22", "2001/03/19", "2001/02/10",
"2001/01/19", "2001/02/09", "2001/09/28", "2001/01/19", "2001/01/31",
"2001/03/19", "2001/01/31", "2001/02/09", "2001/03/07", "2001/08/10",
"2001/09/29", "2001/07/31", "2001/06/20", "2001/07/03", "2001/09/12",
"2001/03/30", "2002/05/07", "2002/08/10", "2002/02/23", "2001/09/06",
"2001/03/19", "2001/10/30", "2001/01/29", "2001/04/28", "2001/11/17",
"2002/02/23", "2001/03/15", "2001/10/28", "2001/01/31", "2001/06/12",
"2003/08/06", "2002/01/09", "2001/08/30", "2001/12/22", "2001/08/21",
"2001/04/16", "2001/11/15", "2002/05/03", "2001/03/15", "2001/08/29",
"2001/09/12", "2001/11/17", "2001/10/04", "2001/08/20", "2001/08/21",
"2001/11/17", "2003/08/06", "2001/04/03", "2001/07/22", "2001/02/11",
"2001/09/12", "2001/07/03", "2001/05/11", "2002/01/09", "2001/03/05",
"2001/07/10", "2003/06/25", "2001/02/18", "2001/03/27", "2001/06/06",
"2002/08/11", "2001/04/27", "2001/02/18", "2001/08/22", "2002/02/23",
"2001/10/30", "2001/07/03", "2001/06/04", "2003/04/28")), row.names = c(NA,
100L), class = "data.frame")