1

I'm looking to join two data frames, One has the Postcode and corresponding ip_start number and ip_end number and the other data frame has user level information at ip level (which i have converted to ip_number). So eventually i would like to have for each postcode count of user_id based on the join. I am using sqldf package for this but its taking time. Here's my code :

sqldf("
select distinct
postcode,
count(distinct b.user_id_64)
from  xx as b 
join  mydata as a
on b.ip_address between a.ip_start and a.ip_end
group by 1")

This is the sample data for two data frames

Dput for the user level table is :

structure(list(user_id_64 = c(2.56665e+18, 7.15794e+18, 4.69526e+17, 
2.33457e+18, 3.45775e+18, 3.45775e+18, 4.69526e+17, 4.29208e+18, 
4.69526e+17, 4.38089e+18, 6.42538e+18, 2.674e+17, 7.92854e+18, 
2.64035e+18, 6.86659e+18, 8.6558e+18, 3.8103e+18, 6.82306e+18, 
7.02596e+18, 3.74238e+18, 5.71553e+18, 5.69603e+18, 8.38862e+18, 
5.49038e+18, 2.84231e+18, 6.31951e+18, 5.77869e+18, 1.11989e+18, 
1.27041e+18, 3.26287e+17, 8.46056e+18, 5.26405e+18, 3.26287e+17, 
5.20846e+18, 9.08398e+18, 3.30178e+18, 6.18623e+18, 1.27041e+18, 
7.56282e+18, 4.15629e+18, 7.7631e+18, 7.59264e+18, 7.56282e+18, 
8.7482e+18, 5.16656e+18, 3.63422e+18, 6.29831e+18, 7.59264e+18, 
2.50311e+18, 8.91334e+18, 6.77925e+18, 1.27041e+18, 8.31891e+18, 
7.7631e+18, 2.10397e+18, 6.71511e+18, 3.99658e+18, 3.99658e+18, 
5.49038e+18, 4.58468e+18, 3.99658e+18, 4.15572e+18, 3.26566e+18, 
3.26287e+17, 2.39782e+17, 5.59039e+18, 3.52355e+17, 6.35331e+18, 
6.35331e+18, 2.48015e+18, 4.15288e+18, 9.144e+18, 1.68929e+18, 
1.58435e+18, 7.15768e+18, 6.97902e+18, 1.50188e+18, 1.16074e+18, 
1.06608e+18, 6.83616e+18, 1.33174e+18, 2.08647e+18, 7.59264e+18, 
6.60321e+18, 1.35883e+18, 6.60321e+18, 5.04704e+18, 4.58468e+18, 
2.64539e+18, 4.58468e+18, 4.58468e+18, 7.66672e+18, 7.20964e+18, 
7.75166e+18, 7.92436e+18, 2.32202e+18, 7.59264e+18, 3.30178e+18, 
3.30178e+18, 8.79436e+18), segment_id = c(3446169L, 3446169L, 
3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 
3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 
3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 
3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 
3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 
3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 
3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 
3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 
3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 
3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 
3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 
3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 
3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 
3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L, 3446169L
), ip_address = c(1599602944, 3646507008, 1053535166, 1354458624, 
3167894016, 3167894016, 1053535166, 1338907392, 1053535166, 1831817728, 
1573085952, 3287532800, 2401036288, 1340220928, 1357177856, 1050502656, 
792868352, 1351343360, 3641064192, 1437891328, 3160759552, 1464856320, 
1548261376, 3575098624, 3646631168, 1339071232, 1347486464, 1040773632, 
1351790592, 1599791616, 1469657088, 1054948864, 1599791616, 2965843456, 
1528858880, 621065472, 1550676776, 1351790592, 2313306624, 3108808192, 
3273934592, 1380218249, 2313306624, 1413036288, 2227665152, 3564732928, 
1486424320, 1380218112, 1575304448, 3646534144, 1419969280, 1351790592, 
1599787264, 3273934592, 1437817088, 1527809536, 3657357824, 3657357824, 
3575098624, 3166801408, 3657357824, 1599791104, 1601207808, 1599791616, 
2508089856, 1351818496, 1519995904, 1419071488, 1419071488, 3261184000, 
1302211584, 1472064768, 1573445120, 1351673856, 1352004352, 534593762, 
2365983488, 1541224960, 3657205760, 1469483264, 1417133312, 1049678592, 
1380218112, 3655424000, 1338765568, 3655424000, 1468009216, 3166801408, 
1599791616, 3166801408, 3166801408, 3641061120, 1607520000, 1340260096, 
3655680256, 1291976960, 1380218112, 621065472, 621065472, 1440235520
), imp_time1 = structure(c(1452623400, 1452623400, 1452623400, 
1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 
1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 
1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 
1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 
1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 
1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 
1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 
1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 
1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 
1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 
1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 
1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 
1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 
1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 
1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 
1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 1452623400, 
1452623400), class = c("POSIXct", "POSIXt"), tzone = "")), .Names = c("user_id_64", 
"segment_id", "ip_address", "imp_time1"), row.names = c(NA, 100L
), class = "data.frame")

Postcode table is :

 postcode ip_start   ip_end
1     60323 34604544 34605055
2     53804 44177088 44177151
3     84079 44421680 44421695
4     01640 46763776 46764031
5     27749 46862592 46862847
6     44534 47091968 47092223
7     25355 49320343 49320343
8     91058 49446912 49447101
9     90419 49488640 49488895
10    58089 49544576 49544703
11    58640 49593984 49594111
12    68723 49700736 49700863
13    40213 71663708 71663711
14    60388 83973649 83973650
15    91710 84477147 84477156
16    91710 84478915 84478916
17    91710 84480297 84480297
18    91710 84481923 84481924
19    91710 84482887 84482890
20    91710 84484917 84484917
21    91710 84485644 84485644
22    91710 84488196 84488196
23    91710 84489007 84489007
24    91710 84489443 84489443
25    91710 84490211 84490211
26    91710 84491214 84491214
27    91710 84492069 84492069
28    91710 84493253 84493253
29    91710 84494557 84494564
30    91710 84495989 84495989
31    91710 84496638 84496641
32    91710 84497550 84497550
33    91710 84498257 84498257
34    91710 84498901 84498901
35    91710 84500082 84500082
36    91710 84500558 84500558
37    91710 84501930 84501930
38    91710 84502729 84502729
39    91710 84504087 84504087
40    91710 84505002 84505002
41    91710 84506269 84506269
42    91710 84507441 84507441
43    91710 84509920 84509929
44    91710 84511088 84511093
45    91710 84511791 84511791
46    91710 84512746 84512746
47    91710 84514481 84514482
48    91710 84515526 84515526
49    91710 84516891 84516891
50    91710 84519596 84519597

Is there better way to join , maybe dplyr approach(i am not aware how to do it using this approach) let me know how to get this done.

CuriousBeing
  • 1,592
  • 14
  • 34
PSraj
  • 229
  • 4
  • 10

0 Answers0