I'm looking to join two data frames, One has the Postcode and corresponding ip_start number and ip_end number and the other data frame has user level information at ip level (which i have converted to ip_number). So eventually i would like to have for each postcode count of user_id based on the join. I am using sqldf package for this but its taking time. Here's my code :
sqldf("
select distinct
postcode,
count(distinct b.user_id_64)
from xx as b
join mydata as a
on b.ip_address between a.ip_start and a.ip_end
group by 1")
This is the sample data for two data frames
Dput for the user level table is :
structure(list(user_id_64 = c(2.56665e+18, 7.15794e+18, 4.69526e+17,
2.33457e+18, 3.45775e+18, 3.45775e+18, 4.69526e+17, 4.29208e+18,
4.69526e+17, 4.38089e+18, 6.42538e+18, 2.674e+17, 7.92854e+18,
2.64035e+18, 6.86659e+18, 8.6558e+18, 3.8103e+18, 6.82306e+18,
7.02596e+18, 3.74238e+18, 5.71553e+18, 5.69603e+18, 8.38862e+18,
5.49038e+18, 2.84231e+18, 6.31951e+18, 5.77869e+18, 1.11989e+18,
1.27041e+18, 3.26287e+17, 8.46056e+18, 5.26405e+18, 3.26287e+17,
5.20846e+18, 9.08398e+18, 3.30178e+18, 6.18623e+18, 1.27041e+18,
7.56282e+18, 4.15629e+18, 7.7631e+18, 7.59264e+18, 7.56282e+18,
8.7482e+18, 5.16656e+18, 3.63422e+18, 6.29831e+18, 7.59264e+18,
2.50311e+18, 8.91334e+18, 6.77925e+18, 1.27041e+18, 8.31891e+18,
7.7631e+18, 2.10397e+18, 6.71511e+18, 3.99658e+18, 3.99658e+18,
5.49038e+18, 4.58468e+18, 3.99658e+18, 4.15572e+18, 3.26566e+18,
3.26287e+17, 2.39782e+17, 5.59039e+18, 3.52355e+17, 6.35331e+18,
6.35331e+18, 2.48015e+18, 4.15288e+18, 9.144e+18, 1.68929e+18,
1.58435e+18, 7.15768e+18, 6.97902e+18, 1.50188e+18, 1.16074e+18,
1.06608e+18, 6.83616e+18, 1.33174e+18, 2.08647e+18, 7.59264e+18,
6.60321e+18, 1.35883e+18, 6.60321e+18, 5.04704e+18, 4.58468e+18,
2.64539e+18, 4.58468e+18, 4.58468e+18, 7.66672e+18, 7.20964e+18,
7.75166e+18, 7.92436e+18, 2.32202e+18, 7.59264e+18, 3.30178e+18,
3.30178e+18, 8.79436e+18), segment_id = c(3446169L, 3446169L,
3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L,
3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L,
3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L,
3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L,
3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L,
3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L,
3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L,
3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L,
3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L,
3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L,
3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L,
3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L,
3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L,
3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L
), ip_address = c(1599602944, 3646507008, 1053535166, 1354458624,
3167894016, 3167894016, 1053535166, 1338907392, 1053535166, 1831817728,
1573085952, 3287532800, 2401036288, 1340220928, 1357177856, 1050502656,
792868352, 1351343360, 3641064192, 1437891328, 3160759552, 1464856320,
1548261376, 3575098624, 3646631168, 1339071232, 1347486464, 1040773632,
1351790592, 1599791616, 1469657088, 1054948864, 1599791616, 2965843456,
1528858880, 621065472, 1550676776, 1351790592, 2313306624, 3108808192,
3273934592, 1380218249, 2313306624, 1413036288, 2227665152, 3564732928,
1486424320, 1380218112, 1575304448, 3646534144, 1419969280, 1351790592,
1599787264, 3273934592, 1437817088, 1527809536, 3657357824, 3657357824,
3575098624, 3166801408, 3657357824, 1599791104, 1601207808, 1599791616,
2508089856, 1351818496, 1519995904, 1419071488, 1419071488, 3261184000,
1302211584, 1472064768, 1573445120, 1351673856, 1352004352, 534593762,
2365983488, 1541224960, 3657205760, 1469483264, 1417133312, 1049678592,
1380218112, 3655424000, 1338765568, 3655424000, 1468009216, 3166801408,
1599791616, 3166801408, 3166801408, 3641061120, 1607520000, 1340260096,
3655680256, 1291976960, 1380218112, 621065472, 621065472, 1440235520
), imp_time1 = structure(c(1452623400, 1452623400, 1452623400,
1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 1452623400,
1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 1452623400,
1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 1452623400,
1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 1452623400,
1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 1452623400,
1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 1452623400,
1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 1452623400,
1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 1452623400,
1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 1452623400,
1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 1452623400,
1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 1452623400,
1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 1452623400,
1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 1452623400,
1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 1452623400,
1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 1452623400,
1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 1452623400,
1452623400), class = c("POSIXct", "POSIXt"), tzone = "")), .Names = c("user_id_64",
"segment_id", "ip_address", "imp_time1"), row.names = c(NA, 100L
), class = "data.frame")
Postcode table is :
postcode ip_start ip_end
1 60323 34604544 34605055
2 53804 44177088 44177151
3 84079 44421680 44421695
4 01640 46763776 46764031
5 27749 46862592 46862847
6 44534 47091968 47092223
7 25355 49320343 49320343
8 91058 49446912 49447101
9 90419 49488640 49488895
10 58089 49544576 49544703
11 58640 49593984 49594111
12 68723 49700736 49700863
13 40213 71663708 71663711
14 60388 83973649 83973650
15 91710 84477147 84477156
16 91710 84478915 84478916
17 91710 84480297 84480297
18 91710 84481923 84481924
19 91710 84482887 84482890
20 91710 84484917 84484917
21 91710 84485644 84485644
22 91710 84488196 84488196
23 91710 84489007 84489007
24 91710 84489443 84489443
25 91710 84490211 84490211
26 91710 84491214 84491214
27 91710 84492069 84492069
28 91710 84493253 84493253
29 91710 84494557 84494564
30 91710 84495989 84495989
31 91710 84496638 84496641
32 91710 84497550 84497550
33 91710 84498257 84498257
34 91710 84498901 84498901
35 91710 84500082 84500082
36 91710 84500558 84500558
37 91710 84501930 84501930
38 91710 84502729 84502729
39 91710 84504087 84504087
40 91710 84505002 84505002
41 91710 84506269 84506269
42 91710 84507441 84507441
43 91710 84509920 84509929
44 91710 84511088 84511093
45 91710 84511791 84511791
46 91710 84512746 84512746
47 91710 84514481 84514482
48 91710 84515526 84515526
49 91710 84516891 84516891
50 91710 84519596 84519597
Is there better way to join , maybe dplyr approach(i am not aware how to do it using this approach) let me know how to get this done.