I have a pandas dataframe with geohashes as indexes along with a column called neighbours which stores a list of neighbours for each geohash. There are a few other columns with metadata for each geohash as well. The dataframe looks like this:
Geohash (index) | Wave Height | Normalized Wave Height | Speed Factor | Neighbours |
---|---|---|---|---|
u4sj9hz | 0.962316 | 0.361604 | 0.757725 | ['u4sj9hy', 'u4sj9kb', 'u4sj9hx', 'u4sj9hw', 'u4sj9k8', ...] |
u4ezqxn | 0.570723 | 0.214457 | 0.856314 | ['u4ezqxj', 'u4ezqxp', 'u4ezqwy', 'u4ezqwv', 'u4ezqwz', ... |
I need to create an edge_list used for graph creation; at first I did the following:
def create_edge_list(geohash, speed_factor, neighbours):
    """Build (source, target, attrs) edges from one geohash to each of its neighbours.

    Each edge carries the haversine distance (km) and the travel time in
    minutes at a base speed of 14 m/s scaled by *speed_factor*.
    """
    # Speed is invariant per call; hoist it out of the loop.
    speed = 14 * speed_factor  # m/s
    edges = []
    for neighbour in neighbours:
        distance = haversine_distance(geohash, neighbour)
        # distance is in km, speed is in m/s: speed * 3.6 converts to km/h,
        # distance / (km/h) gives hours, * 60 gives minutes.
        minutes = round((distance / (speed * 3.6)) * 60, 1)
        edges.append((geohash, neighbour, {"distance": distance, "time": minutes}))
    return edges
# Sequential driver: one create_edge_list call per dataframe row.
for geohash, row in tqdm(df.iterrows(), desc="Creating edge list", total=len(df.index), colour="green"):
    # Extend the global edge list in place with this row's edges.
    elist.extend(create_edge_list(geohash, row.speed_factor, row.neighbours))
But this is extremely slow considering I have over 7 million rows. I then tried multiprocessing and multithreading, trying out both ProcessPoolExecutor and ThreadPoolExecutor, but these did not help much. Any suggestions?
Edit: It seems I had some errors in the ProcessPoolExecutor usage; once I fixed those it worked and did speed things up (it took 80 minutes to run, down from several hours of just looping through). I also made a slightly edited minimal reproducible example (notebook).
# Using Python 3.11.2, but works fine for most other newer Python versions
!pip install geopandas
!pip install geohash
!pip install polygeohasher
!pip install shapely
!pip install pandas
!pip install tqdm
import os
import random
from math import asin, cos, radians, sin, sqrt

import geohash as gh
import geopandas as gpd
import pandas as pd
from polygeohasher import polygeohasher
from shapely.wkt import loads
from tqdm import tqdm
def haversine_distance(geohash1, geohash2):
    """Great-circle distance in kilometres between two geohash cell centres.

    If *geohash2* is a list of geohashes, returns a list of distances
    (each rounded to 3 decimals) from *geohash1* to every entry.
    """
    if isinstance(geohash2, list):
        # Fix: the original used `gh` as the loop variable, shadowing the
        # imported geohash module `gh` (it only worked because comprehensions
        # have their own scope). Use a distinct name to avoid confusion.
        return [round(haversine_distance(geohash1, neighbour), 3)
                for neighbour in geohash2]
    lat1, lon1 = gh.decode(geohash1)
    lat2, lon2 = gh.decode(geohash2)
    # decode may return non-float types depending on the geohash package
    # version -- TODO confirm; coercion kept from the original.
    lat1, lon1 = (float(lat1), float(lon1))
    lat2, lon2 = (float(lat2), float(lon2))
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    # Haversine formula.
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    c = 2 * asin(sqrt(a))
    r = 6371  # Mean Earth radius in kilometres; use 3956 for miles.
    return c * r
def create_edge_list(geohash, speed_factor, neighbours):
    """Build (source, target, attrs) edges for one geohash.

    Computes all neighbour distances with a single haversine_distance call
    and derives travel time (minutes, 2 decimals) at 14 m/s * speed_factor.
    """
    # 60 / (3.6 * speed_kmh) converts a km distance directly to minutes.
    minutes_per_km = 60 / (3.6 * 14 * speed_factor)
    targets = list(neighbours)
    distances = haversine_distance(geohash, targets)
    return [
        (geohash, target, {"distance": dist, "time": round(dist * minutes_per_km, 2)})
        for target, dist in zip(targets, distances)
    ]
if __name__ == "__main__":
    GEOHASH_PRECISION = 6
    # Create polygons using: https://clydedacruz.github.io/openstreetmap-wkt-playground/
    polygon_wkt = "POLYGON((9.07196044921875 53.91728101547625,8.25897216796875 52.99495027026802,5.88043212890625 53.20603255157843,5.072937011718749 53.497849543967675,5.913391113281249 53.74221377343122,6.05621337890625 54.004540438503625,8.73687744140625 54.072282655603885,9.07196044921875 53.91728101547625))"
    polygon_gdf = gpd.GeoDataFrame(index=[0], crs="EPSG:4326", geometry=[loads(polygon_wkt)])
    print("Creating geohash list...")
    temp_df = polygeohasher.create_geohash_list(polygon_gdf, GEOHASH_PRECISION, inner=True)
    df = pd.DataFrame(temp_df.geohash_list.values.tolist()[0], columns=["geohash"])
    df.set_index("geohash", inplace=True)
    # Just simulate some speed factor for now.
    df["speed_factor"] = [random.uniform(0.4, 1.0) for _ in range(len(df.index))]
    neighbours = {geohash: gh.neighbors(geohash) for geohash in df.index}
    df["neighbours"] = df.index.map(neighbours)
    elist = []
    MT = False  # toggle the multiprocessing path
    print("Creating edge list...")
    if MT:
        from concurrent.futures import ProcessPoolExecutor

        geohash_list = list(df.index)
        speed_factor_list = list(df.speed_factor)
        neighbours_list = list(df.neighbours)
        # Fix: chunksize must be >= 1. Integer division yields 0 whenever the
        # frame has fewer rows than CPU cores, making executor.map raise
        # ValueError; clamp to at least 1.
        chunksize = max(1, len(df.index) // os.cpu_count())
        with tqdm(desc="Creating edge list", total=len(df.index), colour="green") as pbar:
            with ProcessPoolExecutor(os.cpu_count()) as executor:
                result = executor.map(
                    create_edge_list,
                    geohash_list,
                    speed_factor_list,
                    neighbours_list,
                    chunksize=chunksize,
                )
                for edge_list in result:
                    elist.extend(edge_list)
                    pbar.update(1)
    else:
        for geohash, row in tqdm(df.iterrows(), desc="Creating edge list", total=len(df.index), colour="green"):
            edge_list = create_edge_list(geohash, row.speed_factor, row.neighbours)
            elist.extend(edge_list)