coordinates = df0.registerTempTable('coordinates') #5555 rows
metro_table = df1.registerTempTable('metro_table') #272 rows
I'm trying to find the closest points to the metro station for that I implement this SQL query but I don't understand why Spark does not accept the INNER JOIN !!! And my second question is how to return the first row only !! I used fetch first 1 rows only but it gives me an indentation error
query = "SELECT uuid,\
latitude,\
longitude,\
p.station_id,\
p.xlat,\
p.xlong,\
p.type_train,\
p.id_transport,\
6371000* DEGREES(ACOS(COS(RADIANS(p.xlat))\
* COS(RADIANS(latitude))\
* COS(RADIANS(p.xlong) - RADIANS(longitude))\
+ SIN(RADIANS(p.xlat))\
* SIN(RADIANS(latitude)))) AS distance_in_meters\
FROM coordinates\
CROSS JOIN (\
SELECT id AS id_transport,\
station_id,\
xlat,\
xlong,\
type_train\
FROM metro_table\
fetch first 1 rows only\ # Doesn't work in Spark
) AS p ON 1=1\
ORDER BY distance_in_meters"
# Run query
df = sqlContext.sql(query) #1510960 rows
Using Pyspark (fetch first 1 rows only)
w = Window.partitionBy(['uuid', 'latitude', 'longitude']).orderBy('distance_in_meters')
df.select('uuid', 'latitude', 'longitude', xlat, xlong, F.min('distance_in_meters').over(w)).count() #1510960 rows
Merto
---+----------+----------------+----------+----------+
|id |xlong |xlat |station_id|type_train|
+---+----------+----------------+----------+----------+
|1 |-73.668172|45.5552769999931|1 |métro |
|2 |-73.668486|45.5542469999931|2 |métro |
|3 |-73.668225|45.5556069999931|3 |métro |
|4 |-73.667407|45.5561219999931|4 |métro |
+---+----------+----------------+----------+----------+
Coordinates
+-----+---------+----------+
|uuid | latitude| longitude|
+-----+---------+----------+
|1009 | 45.53175| -73.62613|
|1009 | 45.53163| -73.62546|
+-----+---------+----------+
After CROSS JOIN
+----+--------+---------+----------+----------------+----------+----------+-- ----------------+
|uuid|latitude|longitude|station_id| xlat| xlong|type_train|distance_in_meters|
+----+--------+---------+----------+----------------+----------+----------+------------------+
|1009|45.53175|-73.62613| 2|45.5542469999931|-73.668486| metro|237197.13838255248|
|1009|45.53163|-73.62546| 2|45.5542469999931|-73.668486| metro|240044.33000560844|
|1009|45.53175|-73.62613| 1|45.5552769999931|-73.668172| metro| 240121.5093484111|
|1009|45.53175|-73.62613| 4|45.5561219999931|-73.667407| metro|240897.59082511123|
|1009|45.53175|-73.62613| 3|45.5556069999931|-73.668225| metro|241622.85492502493|
|1009|45.53163|-73.62546| 1|45.5552769999931|-73.668172| metro|242937.79388593792|
|1009|45.53163|-73.62546| 4|45.5561219999931|-73.667407| metro| 243679.8807249287|
|1009|45.53163|-73.62546| 3|45.5556069999931|-73.668225| metro| 244431.2963545028|
+----+--------+---------+----------+----------------+----------+----------+------------------+
Desirable results
+----+--------+---------+----------+----------------+----------+----------+------------------+
|uuid|latitude|longitude|station_id| xlat| xlong|type_train|distance_in_meters|
+----+--------+---------+----------+----------------+----------+----------+------------------+
|1009|45.53175|-73.62613| 2|45.5542469999931|-73.668486| metro|237197.13838255248|
|1009|45.53163|-73.62546| 2|45.5542469999931|-73.668486| metro|240044.33000560844|