from pyspark.sql.window import Window
import mpu
from pyspark.sql.functions import udf
from pyspark.sql.functions import lag
from math import sin, cos, sqrt, atan2
windowSpec = Window.partitionBy("UserID").orderBy(asc("Timestamp"))
df14=df.withColumn("newLatitude",lag("Latitude",1).over(windowSpec)) \
.withColumn("newLongitude",lag("Longitude",1).over(windowSpec)) \
.drop('AllZero'," Date","Time","Altitude")
df15=df14.orderBy(col("UserID").asc(),col("Timestamp").asc())
df16=df15.na.drop()
from geopy.distance import geodesic
origin = (30.172705, 31.526725) # (latitude, longitude) don't confuse
dist = (30.288281, 31.732326)
print(geodesic(origin, dist).meters)
df17=df16.withColumn("distance",geodesic((col("Latitude"), col("Longitude")), (col("newLatitude"), col("newLongitude"))).meters)
df17.show()
i try to use lag function to get put the previous set of Latitude and Longitude after the original df, but when i try to caculate the distance between these two sets of Latitude and Longitude, it went worong like:
/usr/local/spark/python/pyspark/sql/column.py in nonzero(self) 688 689 def nonzero(self): --> 690 raise ValueError("Cannot convert column into bool: please use '&' for 'and', '|' for 'or', " 691 "'~' for 'not' when building DataFrame boolean expressions.") 692 bool = nonzero
ValueError: Cannot convert column into bool: please use '&' for 'and', '|' for 'or', '~' for 'not' when building DataFrame boolean expressions. I really don't understand what was going on.